Niharmahesh committed
Commit 80987ab · verified · 1 Parent(s): 481fd0c

Update app.py

Files changed (1):
  1. app.py +6 -4
app.py CHANGED

@@ -55,17 +55,19 @@ HF_TOKEN = st.secrets["HF_TOKEN"]
 HF_USERNAME = st.secrets["HF_USERNAME"]
 DATASET_NAME = "jobeasz"
 
+import pyarrow.feather as feather
+
 @st.cache_data(ttl=3600)
 def load_and_concat_data():
     api = HfApi()
     dataset_files = api.list_repo_files(repo_id=f"{HF_USERNAME}/{DATASET_NAME}", repo_type="dataset")
-    csv_files = [file for file in dataset_files if file.endswith('.csv')]
+    feather_files = [file for file in dataset_files if file.endswith('.feather')]
 
     all_data = []
-    for file in csv_files:
+    for file in feather_files:
         try:
             file_content = api.hf_hub_download(repo_id=f"{HF_USERNAME}/{DATASET_NAME}", filename=file, repo_type="dataset", token=HF_TOKEN)
-            df = pd.read_csv(file_content, engine='pyarrow')
+            df = feather.read_feather(file_content)
             all_data.append(df)
         except Exception:
             pass # Silently skip files that can't be processed
@@ -102,7 +104,7 @@ def load_and_concat_data():
 
     # Clean the location in place
     filtered_df['location'] = filtered_df['location'].apply(clean_location)
-    #added new line to drop duplciate records
+    #added new line to drop duplicate records
     filtered_df = filtered_df.drop_duplicates()
 
     return filtered_df
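
For context, the commit swaps pandas' CSV reader for pyarrow's Feather reader when loading the dataset shards. Below is a minimal, self-contained sketch of that loading pattern, not the full app.py function: the repo id and token placeholders are assumptions for illustration, and the column filtering that the real load_and_concat_data performs between the two hunks is omitted.

```python
# Sketch of the post-commit loading pattern: list .feather files in a
# Hugging Face dataset repo, download each one, and read it with pyarrow.feather.
# REPO_ID and TOKEN are placeholders; app.py builds them from st.secrets.
import pandas as pd
import pyarrow.feather as feather
from huggingface_hub import HfApi

REPO_ID = "your-username/jobeasz"  # hypothetical repo id for illustration
TOKEN = None                       # app.py reads HF_TOKEN from st.secrets


def load_feather_shards(repo_id: str = REPO_ID, token: str | None = TOKEN) -> pd.DataFrame:
    api = HfApi()
    files = api.list_repo_files(repo_id=repo_id, repo_type="dataset")
    frames = []
    for name in (f for f in files if f.endswith(".feather")):
        try:
            # Download the shard to the local cache and get its path
            local_path = api.hf_hub_download(
                repo_id=repo_id, filename=name, repo_type="dataset", token=token
            )
            # read_feather returns a pandas DataFrame
            frames.append(feather.read_feather(local_path))
        except Exception:
            continue  # skip shards that fail to download or parse, as app.py does
    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
```

Feather preserves dtypes and avoids CSV parsing overhead, which is presumably the motivation for the format switch; the drop_duplicates call in the second hunk is unchanged apart from the comment's spelling fix.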