Update app.py
app.py CHANGED
@@ -55,17 +55,19 @@ HF_TOKEN = st.secrets["HF_TOKEN"]
 HF_USERNAME = st.secrets["HF_USERNAME"]
 DATASET_NAME = "jobeasz"
 
+import pyarrow.feather as feather
+
 @st.cache_data(ttl=3600)
 def load_and_concat_data():
     api = HfApi()
     dataset_files = api.list_repo_files(repo_id=f"{HF_USERNAME}/{DATASET_NAME}", repo_type="dataset")
-    …
+    feather_files = [file for file in dataset_files if file.endswith('.feather')]
 
     all_data = []
-    for file in …
+    for file in feather_files:
         try:
             file_content = api.hf_hub_download(repo_id=f"{HF_USERNAME}/{DATASET_NAME}", filename=file, repo_type="dataset", token=HF_TOKEN)
-            df = …
+            df = feather.read_feather(file_content)
             all_data.append(df)
         except Exception:
             pass # Silently skip files that can't be processed
@@ -102,7 +104,7 @@ def load_and_concat_data():
 
     # Clean the location in place
     filtered_df['location'] = filtered_df['location'].apply(clean_location)
-    #added new line to drop
+    #added new line to drop duplicate records
     filtered_df = filtered_df.drop_duplicates()
 
     return filtered_df
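For orientation, below is a minimal, self-contained sketch of how the updated loader reads once both hunks are applied. The module imports, the secrets lookup shown only in the hunk context, and the final pd.concat are assumptions: just the lines visible in the diff above come from the source, and the function evidently does filtering and cleaning between the two hunks that this sketch omits.

import pandas as pd
import pyarrow.feather as feather
import streamlit as st
from huggingface_hub import HfApi

HF_TOKEN = st.secrets["HF_TOKEN"]
HF_USERNAME = st.secrets["HF_USERNAME"]
DATASET_NAME = "jobeasz"

@st.cache_data(ttl=3600)  # reuse the loaded data for one hour
def load_and_concat_data():
    api = HfApi()
    # List every file in the dataset repo, then keep only Feather files.
    dataset_files = api.list_repo_files(
        repo_id=f"{HF_USERNAME}/{DATASET_NAME}", repo_type="dataset"
    )
    feather_files = [f for f in dataset_files if f.endswith(".feather")]

    all_data = []
    for file in feather_files:
        try:
            # Downloads into the local HF cache; despite the name used in
            # the diff, this call returns a local file path, not the bytes.
            file_content = api.hf_hub_download(
                repo_id=f"{HF_USERNAME}/{DATASET_NAME}",
                filename=file,
                repo_type="dataset",
                token=HF_TOKEN,
            )
            # pyarrow.feather.read_feather returns a pandas DataFrame.
            df = feather.read_feather(file_content)
            all_data.append(df)
        except Exception:
            pass  # Silently skip files that can't be processed
    # Assumed, not shown in the diff: the function name implies the frames
    # are concatenated before the filtering/cleaning of the second hunk.
    return pd.concat(all_data, ignore_index=True)

One design note worth flagging: the bare except swallows every failure, so a corrupt or mis-uploaded Feather file disappears without a trace; logging the skipped filename would make debugging easier while still keeping the Space running.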