Niharmahesh committed · verified
Commit efabaac · 1 Parent(s): 6e173f8

Update app.py

Files changed (1): app.py (+41 -51)
app.py CHANGED
@@ -55,8 +55,6 @@ HF_TOKEN = st.secrets["HF_TOKEN"]
 HF_USERNAME = st.secrets["HF_USERNAME"]
 DATASET_NAME = "jobeasz"
 
-
-
 @st.cache_data(ttl=3600)
 def load_and_concat_data():
     api = HfApi()
@@ -68,64 +66,56 @@ def load_and_concat_data():
         try:
             file_content = api.hf_hub_download(repo_id=f"{HF_USERNAME}/{DATASET_NAME}", filename=file, repo_type="dataset", token=HF_TOKEN)
 
-            # Use PyArrow to read CSV
-            read_options = csv.ReadOptions(column_names=[
-                'site', 'job_url', 'title', 'company', 'location',
-                'job_type', 'date_posted', 'is_remote', 'company_url'
-            ])
-            parse_options = csv.ParseOptions(delimiter=',')
-            convert_options = csv.ConvertOptions(
-                timestamp_parsers=['%Y-%m-%d']
-            )
-            table = csv.read_csv(file_content, read_options=read_options, parse_options=parse_options, convert_options=convert_options)
-            all_data.append(table)
+            # Use CSV sniffer to detect delimiter and number of columns
+            with open(file_content, 'r') as f:
+                sample = f.read(1024)
+                sniffer = csv.Sniffer()
+                dialect = sniffer.sniff(sample)
+                f.seek(0)
+
+            # Use pyarrow for more flexible parsing
+            parse_options = csv.ParseOptions(delimiter=dialect.delimiter)
+            table = csv.read_csv(file_content, parse_options=parse_options)
+
+            # Convert to pandas DataFrame
+            df = table.to_pandas()
+
+            # Ensure all required columns are present, fill with NaN if missing
+            required_columns = ['site', 'job_url', 'title', 'company', 'location', 'job_type', 'date_posted', 'is_remote', 'company_url']
+            for col in required_columns:
+                if col not in df.columns:
+                    df[col] = pd.NA
+
+            # Select only the required columns
+            df = df[required_columns]
+
+            all_data.append(df)
         except Exception as e:
             print(f"Error processing file {file}: {str(e)}")
+            continue
 
     if not all_data:
-        return pa.Table.from_pandas(pd.DataFrame())
-
-    # Concatenate all tables
-    concatenated_table = pa.concat_tables(all_data)
-
-    # Filter for 2024 data
-    mask = pc.year(concatenated_table['date_posted']) == 2024
-    filtered_table = concatenated_table.filter(mask)
-
-    # Convert titles and company names to lowercase
-    filtered_table = filtered_table.set_column(
-        filtered_table.schema.get_field_index('title'),
-        'title',
-        pc.utf8_lower(filtered_table['title'])
-    )
-    filtered_table = filtered_table.set_column(
-        filtered_table.schema.get_field_index('company'),
-        'company',
-        pc.utf8_lower(filtered_table['company'])
-    )
-
-    # Clean location
+        return pd.DataFrame()
+
+    concatenated_df = pd.concat(all_data, ignore_index=True)
+
+    # Perform data cleaning and processing
+    concatenated_df['date_posted'] = pd.to_datetime(concatenated_df['date_posted'], errors='coerce')
+    concatenated_df = concatenated_df.dropna(subset=['date_posted'])
+    concatenated_df = concatenated_df[concatenated_df['date_posted'].dt.year == 2024]
+    concatenated_df['title'] = concatenated_df['title'].str.lower()
+    concatenated_df['company'] = concatenated_df['company'].str.lower()
+
     def clean_location(location):
-        if location is None:
-            return None
-        location = location.lower()
+        if pd.isna(location):
+            return location
+        location = str(location).lower()
         return re.sub(r',\s*(us|usa)$', '', location)
 
-    cleaned_locations = pc.map(filtered_table['location'], clean_location)
-    filtered_table = filtered_table.set_column(
-        filtered_table.schema.get_field_index('location'),
-        'location',
-        cleaned_locations
-    )
-
-    # Remove duplicates
-    filtered_table = filtered_table.group_by(filtered_table.column_names).aggregate([])
-
-    # Convert to pandas DataFrame for compatibility with the rest of your code
-    filtered_df = filtered_table.to_pandas()
-
-    return filtered_df
+    concatenated_df['location'] = concatenated_df['location'].apply(clean_location)
+    concatenated_df = concatenated_df.drop_duplicates()
+
+    return concatenated_df
 
 @st.cache_data()
 def get_unique_values(df):
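
A note on the `csv` name used in the new hunk: `csv.Sniffer` comes from Python's standard-library csv module, while `csv.ParseOptions` and `csv.read_csv` belong to `pyarrow.csv`, so both calls cannot resolve through a single `csv` import. The imports at the top of app.py are not part of this diff, so the following is only a minimal sketch of one arrangement that would satisfy both calls; the `pacsv` alias and the `read_job_csv` helper name are illustrative, not taken from the file.

import csv                        # standard library: provides csv.Sniffer
from pyarrow import csv as pacsv  # PyArrow CSV reader: ParseOptions, read_csv
import pandas as pd

def read_job_csv(path: str) -> pd.DataFrame:
    # Sniff the delimiter from a small sample of the file (same idea as the diff).
    with open(path, 'r') as f:
        dialect = csv.Sniffer().sniff(f.read(1024))

    # Parse with PyArrow using the detected delimiter, then hand off to pandas.
    parse_options = pacsv.ParseOptions(delimiter=dialect.delimiter)
    table = pacsv.read_csv(path, parse_options=parse_options)
    return table.to_pandas()

If app.py keeps `from pyarrow import csv`, the `Sniffer()` call would instead need the standard-library module imported under a different name.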
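
As a quick, self-contained check of the new cleaning steps (coerced date parsing, 2024-only filter, lowercasing, location cleanup, de-duplication), the sketch below runs the same pandas operations on a made-up three-row frame; the sample job rows are purely illustrative.

import re

import pandas as pd

def clean_location(location):
    # Same helper as in the updated load_and_concat_data().
    if pd.isna(location):
        return location
    location = str(location).lower()
    return re.sub(r',\s*(us|usa)$', '', location)

df = pd.DataFrame({
    'title': ['Data Engineer', 'Data Engineer', 'ML Engineer'],
    'company': ['Acme', 'Acme', 'Globex'],
    'location': ['San Jose, CA, USA', 'San Jose, CA, USA', 'Remote, US'],
    'date_posted': ['2024-03-01', '2024-03-01', 'not-a-date'],
})

# Mirror the cleaning pipeline from the diff.
df['date_posted'] = pd.to_datetime(df['date_posted'], errors='coerce')
df = df.dropna(subset=['date_posted'])                  # drops the unparseable row
df = df[df['date_posted'].dt.year == 2024]              # keep 2024 postings only
df['title'] = df['title'].str.lower()
df['company'] = df['company'].str.lower()
df['location'] = df['location'].apply(clean_location)   # "San Jose, CA, USA" -> "san jose, ca"
df = df.drop_duplicates()                               # the two identical rows collapse to one

print(df)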