Niharmahesh
committed
Update app.py
app.py
CHANGED
@@ -55,8 +55,6 @@ HF_TOKEN = st.secrets["HF_TOKEN"]
 HF_USERNAME = st.secrets["HF_USERNAME"]
 DATASET_NAME = "jobeasz"
 
-
-
 @st.cache_data(ttl=3600)
 def load_and_concat_data():
     api = HfApi()
@@ -68,64 +66,56 @@ def load_and_concat_data():
         try:
             file_content = api.hf_hub_download(repo_id=f"{HF_USERNAME}/{DATASET_NAME}", filename=file, repo_type="dataset", token=HF_TOKEN)
 
-            # Use …
+            # Use CSV sniffer to detect delimiter and number of columns
+            with open(file_content, 'r') as f:
+                sample = f.read(1024)
+                sniffer = csv.Sniffer()
+                dialect = sniffer.sniff(sample)
+                f.seek(0)
+
+            # Use pyarrow for more flexible parsing
+            parse_options = csv.ParseOptions(delimiter=dialect.delimiter)
+            table = csv.read_csv(file_content, parse_options=parse_options)
+
+            # Convert to pandas DataFrame
+            df = table.to_pandas()
+
+            # Ensure all required columns are present, fill with NaN if missing
+            required_columns = ['site', 'job_url', 'title', 'company', 'location', 'job_type', 'date_posted', 'is_remote', 'company_url']
+            for col in required_columns:
+                if col not in df.columns:
+                    df[col] = pd.NA
 
+            # Select only the required columns
+            df = df[required_columns]
+
+            all_data.append(df)
         except Exception as e:
             print(f"Error processing file {file}: {str(e)}")
+            continue
 
     if not all_data:
-        return
+        return pd.DataFrame()
 
-    # Concatenate all tables
-    concatenated_table = pa.concat_tables(all_data)
-
-    # Filter for 2024 data
-    mask = pc.year(concatenated_table['date_posted']) == 2024
-    filtered_table = concatenated_table.filter(mask)
-
-    # Convert titles and company names to lowercase
-    filtered_table = filtered_table.set_column(
-        filtered_table.schema.get_field_index('title'),
-        'title',
-        pc.utf8_lower(filtered_table['title'])
-    )
-    filtered_table = filtered_table.set_column(
-        filtered_table.schema.get_field_index('company'),
-        'company',
-        pc.utf8_lower(filtered_table['company'])
-    )
+    concatenated_df = pd.concat(all_data, ignore_index=True)
+
+    # Perform data cleaning and processing
+    concatenated_df['date_posted'] = pd.to_datetime(concatenated_df['date_posted'], errors='coerce')
+    concatenated_df = concatenated_df.dropna(subset=['date_posted'])
+    concatenated_df = concatenated_df[concatenated_df['date_posted'].dt.year == 2024]
+    concatenated_df['title'] = concatenated_df['title'].str.lower()
+    concatenated_df['company'] = concatenated_df['company'].str.lower()
 
     def clean_location(location):
-        if location is None:
-            return
-        location = location.lower()
+        if pd.isna(location):
+            return location
+        location = str(location).lower()
         return re.sub(r',\s*(us|usa)$', '', location)
 
-    cleaned_locations = …
-    filtered_table = filtered_table.set_column(
-        filtered_table.schema.get_field_index('location'),
-        'location',
-        cleaned_locations
-    )
-
-    # Remove duplicates
-    filtered_table = filtered_table.group_by(filtered_table.column_names).aggregate([])
-
-    # Convert to pandas DataFrame for compatibility with the rest of your code
-    filtered_df = filtered_table.to_pandas()
-
-    return filtered_df
+    concatenated_df['location'] = concatenated_df['location'].apply(clean_location)
+    concatenated_df = concatenated_df.drop_duplicates()
+
+    return concatenated_df
 
 @st.cache_data()
 def get_unique_values(df):
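
One thing worth flagging in the new hunk: csv.Sniffer() comes from Python's standard csv module, while csv.ParseOptions and csv.read_csv exist only in pyarrow.csv, yet all three are reached through the same csv name. A single import cannot supply both, so the import block (outside this diff) presumably aliases one of them. A minimal sketch under that assumption; read_one_csv and path are illustrative names, not part of the commit:

import csv                         # stdlib: provides csv.Sniffer
from pyarrow import csv as pacsv   # pyarrow: provides ParseOptions and read_csv

def read_one_csv(path):
    # Sniff the delimiter from a small sample, as the hunk does
    with open(path, 'r') as f:
        dialect = csv.Sniffer().sniff(f.read(1024))
    # Parse with pyarrow using the detected delimiter
    opts = pacsv.ParseOptions(delimiter=dialect.delimiter)
    return pacsv.read_csv(path, parse_options=opts).to_pandas()

As written in the hunk, f.seek(0) rewinds a handle that is never read again (pyarrow reopens the file by path), so the line is harmless but redundant.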
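
The required-columns loop is what keeps the later pd.concat stable: each parsed frame is padded to one fixed schema, so files with missing or extra headers cannot reorder columns or leave NaN holes in unpredictable places. A self-contained check with made-up rows:

import pandas as pd

required_columns = ['site', 'job_url', 'title', 'company', 'location',
                    'job_type', 'date_posted', 'is_remote', 'company_url']

# A frame missing most required columns, as a malformed file might yield
df = pd.DataFrame({'title': ['Data Engineer'], 'company': ['Acme']})
for col in required_columns:
    if col not in df.columns:
        df[col] = pd.NA
df = df[required_columns]      # fixed column order across all files
print(df.columns.tolist())     # matches required_columns exactly

df.reindex(columns=required_columns) would do the padding and the selection in one call (missing columns filled with NaN rather than pd.NA).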
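
The 2024 filter relies on errors='coerce': unparseable date_posted values become NaT, dropna(subset=['date_posted']) removes those rows, and the .dt.year comparison keeps only 2024. A quick illustration with invented rows:

import pandas as pd

df = pd.DataFrame({
    'date_posted': ['2024-03-01', 'not a date', '2023-12-31'],
    'title': ['Data Engineer', 'ML Intern', 'Analyst'],
})
df['date_posted'] = pd.to_datetime(df['date_posted'], errors='coerce')
df = df.dropna(subset=['date_posted'])      # drops the NaT row
df = df[df['date_posted'].dt.year == 2024]  # drops the 2023 row
df['title'] = df['title'].str.lower()
print(df)                                   # one row: 2024-03-01, data engineer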
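
clean_location lowercases first and then strips only a trailing ', us' or ', usa'; the $ anchor means the country token is untouched anywhere else in the string, and the pd.isna guard lets missing values flow through .apply unchanged. The sample inputs below are invented:

import re
import pandas as pd

def clean_location(location):
    if pd.isna(location):
        return location
    location = str(location).lower()
    return re.sub(r',\s*(us|usa)$', '', location)

print(clean_location('Austin, TX, USA'))  # -> austin, tx
print(clean_location('Remote, US'))       # -> remote
print(clean_location('USA Today HQ'))     # -> usa today hq (no trailing match)
print(clean_location(pd.NA))              # -> <NA>, passed through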