Update app.py
app.py CHANGED
@@ -55,17 +55,19 @@ HF_TOKEN = st.secrets["HF_TOKEN"]
 HF_USERNAME = st.secrets["HF_USERNAME"]
 DATASET_NAME = "jobeasz"
 
+import pyarrow.feather as feather
+
 @st.cache_data(ttl=3600)
 def load_and_concat_data():
     api = HfApi()
     dataset_files = api.list_repo_files(repo_id=f"{HF_USERNAME}/{DATASET_NAME}", repo_type="dataset")
-    …
+    feather_files = [file for file in dataset_files if file.endswith('.feather')]
 
     all_data = []
-    for file in …
+    for file in feather_files:
         try:
             file_content = api.hf_hub_download(repo_id=f"{HF_USERNAME}/{DATASET_NAME}", filename=file, repo_type="dataset", token=HF_TOKEN)
-            df = …
+            df = feather.read_feather(file_content)
             all_data.append(df)
         except Exception:
             pass # Silently skip files that can't be processed
@@ -102,7 +104,7 @@ def load_and_concat_data():
 
     # Clean the location in place
     filtered_df['location'] = filtered_df['location'].apply(clean_location)
-    #added new line to drop
+    #added new line to drop duplicate records
     filtered_df = filtered_df.drop_duplicates()
 
     return filtered_df
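For orientation, below is a minimal, self-contained sketch of how the updated loader reads once both hunks are applied. The module imports, the secrets lookup shown only in the hunk context, and the final pd.concat are assumptions: just the lines visible in the diff above come from the source, and the function evidently does filtering and cleaning between the two hunks that this sketch omits.

import pandas as pd
import pyarrow.feather as feather
import streamlit as st
from huggingface_hub import HfApi

HF_TOKEN = st.secrets["HF_TOKEN"]
HF_USERNAME = st.secrets["HF_USERNAME"]
DATASET_NAME = "jobeasz"

@st.cache_data(ttl=3600)  # reuse the loaded data for one hour
def load_and_concat_data():
    api = HfApi()
    # List every file in the dataset repo, then keep only Feather files.
    dataset_files = api.list_repo_files(
        repo_id=f"{HF_USERNAME}/{DATASET_NAME}", repo_type="dataset"
    )
    feather_files = [f for f in dataset_files if f.endswith(".feather")]

    all_data = []
    for file in feather_files:
        try:
            # Downloads into the local HF cache; despite the name used in
            # the diff, this call returns a local file path, not the bytes.
            file_content = api.hf_hub_download(
                repo_id=f"{HF_USERNAME}/{DATASET_NAME}",
                filename=file,
                repo_type="dataset",
                token=HF_TOKEN,
            )
            # pyarrow.feather.read_feather returns a pandas DataFrame.
            df = feather.read_feather(file_content)
            all_data.append(df)
        except Exception:
            pass  # Silently skip files that can't be processed
    # Assumed, not shown in the diff: the function name implies the frames
    # are concatenated before the filtering/cleaning of the second hunk.
    return pd.concat(all_data, ignore_index=True)

One design note worth flagging: the bare except swallows every failure, so a corrupt or mis-uploaded Feather file disappears without a trace; logging the skipped filename would make debugging easier while still keeping the Space running.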