Niharmahesh committed
Commit 481fd0c · verified · 1 Parent(s): ad3a72f

Update app.py

Files changed (1)
  1. app.py +34 -35
app.py CHANGED
@@ -65,47 +65,46 @@ def load_and_concat_data():
     for file in csv_files:
         try:
             file_content = api.hf_hub_download(repo_id=f"{HF_USERNAME}/{DATASET_NAME}", filename=file, repo_type="dataset", token=HF_TOKEN)
-
-            # Use PyArrow's CSV reading capabilities
-            read_options = csv.ReadOptions(use_threads=True)
-            parse_options = csv.ParseOptions(delimiter=',')  # Adjust delimiter if needed
-            convert_options = csv.ConvertOptions(
-                column_types={
-                    'date_posted': pa.timestamp('s'),
-                    'is_remote': pa.bool_()
-                },
-                strings_can_be_null=True
-            )
-
-            table = csv.read_csv(file_content, read_options=read_options, parse_options=parse_options, convert_options=convert_options)
-            df = table.to_pandas()
-
-            # Perform data cleaning and processing
-            df['date_posted'] = pd.to_datetime(df['date_posted'], errors='coerce')
-            df = df.dropna(subset=['date_posted'])
-            df = df[df['date_posted'].dt.year == 2024]
-            df['title'] = df['title'].str.lower()
-            df['company'] = df['company'].str.lower()
-
-            def clean_location(location):
-                if pd.isna(location):
-                    return location
-                location = str(location).lower()
-                return re.sub(r',\s*(us|usa)$', '', location)
-
-            df['location'] = df['location'].apply(clean_location)
-
+            df = pd.read_csv(file_content, engine='pyarrow')
             all_data.append(df)
-        except Exception as e:
-            print(f"Error processing file {file}: {str(e)}")
-            continue
+        except Exception:
+            pass  # Silently skip files that can't be processed
 
     if not all_data:
         return pd.DataFrame()
 
     concatenated_df = pd.concat(all_data, ignore_index=True)
-    filtered_df = concatenated_df.drop_duplicates().reset_index(drop=True)
-
+
+    columns_to_keep = [
+        'site', 'job_url', 'title', 'company', 'location',
+        'job_type', 'date_posted', 'is_remote', 'company_url'
+    ]
+    filtered_df = concatenated_df[columns_to_keep].reset_index(drop=True)
+    filtered_df['date_posted'] = pd.to_datetime(filtered_df['date_posted'], errors='coerce')
+
+    # Drop duplicates and rows with NaT in date_posted
+    filtered_df = filtered_df.drop_duplicates().dropna(subset=['date_posted'])
+    # Keep only jobs posted in 2024
+    filtered_df = filtered_df[filtered_df['date_posted'].dt.year == 2024]
+    # Convert titles and company names to lowercase
+    filtered_df['title'] = filtered_df['title'].str.lower()
+    filtered_df['company'] = filtered_df['company'].str.lower()
+
+    # Function to clean the location
+    def clean_location(location):
+        if pd.isna(location):
+            return location  # Return NaN as is
+        # Convert to lowercase
+        location = location.lower()
+        # Remove ', us' or ', usa' from the end using regex
+        location = re.sub(r',\s*(us|usa)$', '', location)
+        return location
+
+    # Clean the location in place
+    filtered_df['location'] = filtered_df['location'].apply(clean_location)
+    # Drop duplicate records again after cleaning
+    filtered_df = filtered_df.drop_duplicates()
+
     return filtered_df
 
 @st.cache_data()
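
For readers comparing the two versions, the sketch below distills the pattern this commit moves to: parse each CSV with pandas' pyarrow engine (pandas >= 1.4 with pyarrow installed), concatenate, then clean once on the combined frame. It is a minimal approximation under stated assumptions, not the app's actual code: load_and_clean and paths are hypothetical stand-ins for load_and_concat_data and the hf_hub_download file handles.

import re
import pandas as pd

def load_and_clean(paths):
    # Hypothetical stand-in for load_and_concat_data(): `paths` replaces
    # the hf_hub_download handles, but the cleaning mirrors the commit.
    frames = []
    for path in paths:
        try:
            # engine='pyarrow' (pandas >= 1.4) delegates parsing to Arrow's
            # multithreaded CSV reader instead of configuring it by hand
            frames.append(pd.read_csv(path, engine='pyarrow'))
        except Exception:
            continue  # skip unreadable files, as the commit does

    if not frames:
        return pd.DataFrame()

    df = pd.concat(frames, ignore_index=True)
    df['date_posted'] = pd.to_datetime(df['date_posted'], errors='coerce')
    # Deduplicating after concatenation also catches rows repeated across
    # files, which the old per-file cleaning could not see
    df = df.drop_duplicates().dropna(subset=['date_posted'])
    df = df[df['date_posted'].dt.year == 2024]
    df['title'] = df['title'].str.lower()
    df['company'] = df['company'].str.lower()
    # Lowercase locations and strip a trailing ', us' / ', usa'
    df['location'] = (
        df['location'].str.lower()
                      .str.replace(r',\s*(us|usa)$', '', regex=True)
    )
    return df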