Niharmahesh committed
Commit 838169f · verified · 1 Parent(s): c5bab47

Update app.py

Files changed (1)
  1. app.py +38 -31
app.py CHANGED
@@ -55,50 +55,57 @@ HF_USERNAME = st.secrets["HF_USERNAME"]
 DATASET_NAME = "jobeasz"
 
 
+
 @st.cache_data(ttl=3600)
 def load_and_concat_data():
     api = HfApi()
     dataset_files = api.list_repo_files(repo_id=f"{HF_USERNAME}/{DATASET_NAME}", repo_type="dataset")
     csv_files = [file for file in dataset_files if file.endswith('.csv')]
 
-    def process_file(file):
+    all_data = []
+    for file in csv_files:
         try:
             file_content = api.hf_hub_download(repo_id=f"{HF_USERNAME}/{DATASET_NAME}", filename=file, repo_type="dataset", token=HF_TOKEN)
-            df = pd.read_csv(file_content, engine='pyarrow', usecols=[
-                'site', 'job_url', 'title', 'company', 'location',
-                'job_type', 'date_posted', 'is_remote', 'company_url'
-            ])
-            df['date_posted'] = pd.to_datetime(df['date_posted'], errors='coerce')
-            df = df[df['date_posted'].dt.year == 2024].dropna(subset=['date_posted'])
-            df['title'] = df['title'].str.lower()
-            df['company'] = df['company'].str.lower()
-            df['location'] = df['location'].apply(clean_location)
-            return df
-        except Exception as e:
-            print(f"Error processing file {file}: {str(e)}")
-            return None
-
-    def clean_location(location):
-        if pd.isna(location):
-            return location
-        location = location.lower()
-        return re.sub(r',\s*(us|usa)$', '', location)
-
-    # Use ThreadPoolExecutor for parallel processing
-    with ThreadPoolExecutor(max_workers=4) as executor:
-        future_to_file = {executor.submit(process_file, file): file for file in csv_files}
-        all_data = []
-        for future in as_completed(future_to_file):
-            df = future.result()
-            if df is not None:
-                all_data.append(df)
+            df = pd.read_csv(file_content, engine='pyarrow')
+            all_data.append(df)
+        except Exception:
+            pass  # Silently skip files that can't be processed
 
     if not all_data:
         return pd.DataFrame()
 
     concatenated_df = pd.concat(all_data, ignore_index=True)
-    filtered_df = concatenated_df.drop_duplicates().reset_index(drop=True)
-
+
+    columns_to_keep = [
+        'site', 'job_url', 'title', 'company', 'location',
+        'job_type', 'date_posted', 'is_remote', 'company_url'
+    ]
+    filtered_df = concatenated_df[columns_to_keep].reset_index(drop=True)
+    filtered_df['date_posted'] = pd.to_datetime(filtered_df['date_posted'], errors='coerce')
+
+    # Drop duplicates and rows with NaT in date_posted
+    filtered_df = filtered_df.drop_duplicates().dropna(subset=['date_posted'])
+    # Keep only postings dated in 2024
+    filtered_df = filtered_df[filtered_df['date_posted'].dt.year == 2024]
+    # Convert titles and company names to lowercase
+    filtered_df['title'] = filtered_df['title'].str.lower()
+    filtered_df['company'] = filtered_df['company'].str.lower()
+
+    # Function to clean the location
+    def clean_location(location):
+        if pd.isna(location):
+            return location  # Return NaN as is
+        # Convert to lowercase
+        location = location.lower()
+        # Remove ', us' or ', usa' from the end using regex
+        location = re.sub(r',\s*(us|usa)$', '', location)
+        return location
+
+    # Clean the location in place
+    filtered_df['location'] = filtered_df['location'].apply(clean_location)
+    # Drop duplicate records again after cleaning
+    filtered_df = filtered_df.drop_duplicates()
+
     return filtered_df
 
 @st.cache_data()
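
The commit replaces the ThreadPoolExecutor fan-out with a plain sequential loop and moves column selection, date parsing, the 2024 filter, and text normalization to after concatenation; read errors are now skipped silently instead of printed per file. A minimal usage sketch of the cached loader follows, assuming the rest of app.py renders the result in a Streamlit page; the st.warning, st.write, and st.dataframe calls below are illustrative and not part of this commit.

import streamlit as st

# Cached by @st.cache_data(ttl=3600), so the dataset is re-downloaded
# and re-processed at most once per hour for a given cache entry.
df = load_and_concat_data()

if df.empty:
    st.warning("No job postings could be loaded from the dataset.")
else:
    st.write(f"Loaded {len(df)} postings dated in 2024 after cleaning and de-duplication.")
    st.dataframe(df.head(20))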