Niharmahesh committed
Delete pages/test.py
pages/test.py +0 -236
pages/test.py
DELETED
@@ -1,236 +0,0 @@
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import pearsonr
from scipy.spatial.distance import euclidean
from sentence_transformers import SentenceTransformer
import groq
import math
import json
from huggingface_hub import HfApi
import io
from pdfminer.high_level import extract_text
import pyarrow.feather as feather
import re
from datetime import datetime, timedelta

HF_TOKEN = st.secrets["HF_TOKEN"]
HF_USERNAME = st.secrets["HF_USERNAME"]
DATASET_NAME = "jobeasz"

@st.cache_data(ttl=3600)
def load_and_concat_data():
    api = HfApi()
    dataset_files = api.list_repo_files(repo_id=f"{HF_USERNAME}/{DATASET_NAME}", repo_type="dataset")
    feather_files = [file for file in dataset_files if file.endswith('.feather')]

    all_data = []
    for file in feather_files:
        try:
            file_content = api.hf_hub_download(repo_id=f"{HF_USERNAME}/{DATASET_NAME}", filename=file, repo_type="dataset", token=HF_TOKEN)
            df = feather.read_feather(file_content)
            all_data.append(df)
        except Exception:
            pass  # Silently skip files that can't be processed

    if not all_data:
        return pd.DataFrame()

    concatenated_df = pd.concat(all_data, ignore_index=True)

    columns_to_keep = [
        'site', 'job_url', 'title', 'company', 'location',
        'job_type', 'date_posted', 'is_remote', 'company_url', 'description'
    ]
    filtered_df = concatenated_df[columns_to_keep].reset_index(drop=True)
    filtered_df['date_posted'] = pd.to_datetime(filtered_df['date_posted'], errors='coerce')

    # Drop duplicates and rows with NaT in date_posted
    filtered_df = filtered_df.drop_duplicates().dropna(subset=['date_posted'])
    # Filter to postings from 2024
    filtered_df = filtered_df[filtered_df['date_posted'].dt.year == 2024]
    # Convert titles and company names to lowercase
    filtered_df['title'] = filtered_df['title'].str.lower()
    filtered_df['company'] = filtered_df['company'].str.lower()

    # Function to clean the location
    def clean_location(location):
        if pd.isna(location):
            return location  # Return NaN as is
        # Convert to lowercase
        location = location.lower()
        # Remove ', us' or ', usa' from the end using regex
        location = re.sub(r',\s*(us|usa)$', '', location)
        return location

    # Clean the location in place
    filtered_df['location'] = filtered_df['location'].apply(clean_location)
    # Added to drop duplicate records after cleaning
    filtered_df = filtered_df.drop_duplicates()

    return filtered_df

def remove_special_chars(text):
    if pd.isna(text):
        return text
    # Remove special characters and markdown formatting
    cleaned_text = re.sub(r'[*\n\-_]', ' ', text)
    # Remove extra whitespace
    cleaned_text = ' '.join(cleaned_text.split())
    return cleaned_text

@st.cache_resource
def load_models():
    return {
        'minilm': SentenceTransformer('all-MiniLM-L6-v2'),
        'mpnet': SentenceTransformer('all-mpnet-base-v2'),
        'paraphrase': SentenceTransformer('paraphrase-MiniLM-L6-v2')
    }

def generate_embeddings(text, models):
    return {
        'minilm': models['minilm'].encode(text),
        'mpnet': models['mpnet'].encode(text),
        'paraphrase': models['paraphrase'].encode(text)
    }

def calculate_similarities(job_embeddings, resume_embedding):
    similarities = []
    for job_embedding in job_embeddings:
        job_emb = np.array(job_embedding).reshape(1, -1)
        res_emb = resume_embedding.reshape(1, -1)
        cosine_sim = cosine_similarity(job_emb, res_emb)[0][0]
        pearson_corr = pearsonr(job_embedding, resume_embedding)[0]
        euclidean_dist = euclidean(job_embedding, resume_embedding)
        similarities.append({
            'cosine': cosine_sim,
            'pearson': pearson_corr,
            'euclidean': euclidean_dist
        })
    return similarities

def get_top_matches(df, n=50):
    top_matches = pd.DataFrame()
    for model_name in ['minilm', 'mpnet', 'paraphrase']:
        for metric in ['cosine', 'pearson', 'euclidean']:
            col_name = f'{model_name}_{metric}'
            ascending = metric == 'euclidean'
            top_n = df.nsmallest(n, col_name) if ascending else df.nlargest(n, col_name)
            top_n['model'] = model_name
            top_n['metric'] = metric
            top_matches = pd.concat([top_matches, top_n])
    return top_matches.drop_duplicates().head(150)

@st.cache_data
def evaluate_with_groq(resume_text, job_description_text, client):
    prompt = f"""
    Resume: {resume_text}
    Job Description: {job_description_text}
    Based on the above information, rate the match quality on a scale of 0-100 and provide reasoning.
    Return your response in the following JSON format:
    {{ "score": <integer between 0 and 100>, "reasoning": "<your explanation>" }}
    """
    response = client.chat.completions.create(
        messages=[
            {"role": "user", "content": prompt}
        ],
        model="mixtral-8x7b-32768",
        max_tokens=200,
    )
    return json.loads(response.choices[0].message.content)

def display_data_explorer(df):
    st.subheader("Data Explorer")
    items_per_page = 15
    num_pages = math.ceil(len(df) / items_per_page)
    col1, col2, col3 = st.columns([1, 3, 1])
    with col2:
        page = st.number_input("Page", min_value=1, max_value=num_pages, value=1)
    start_idx = (page - 1) * items_per_page
    end_idx = start_idx + items_per_page
    page_df = df.iloc[start_idx:end_idx]

    def make_clickable(url, text):
        return f'<a href="{url}" target="_blank" style="color: #4e79a7;">{text}</a>'

    page_df['job_url'] = page_df.apply(lambda row: make_clickable(row['job_url'], 'Link'), axis=1)
    page_df['company_url'] = page_df.apply(lambda row: make_clickable(row['company_url'], row['company']), axis=1)

    display_columns = ['title', 'company_url', 'location', 'job_type', 'date_posted', 'job_url', 'groq_score', 'groq_reasoning']
    st.write(page_df[display_columns].to_html(escape=False, index=False), unsafe_allow_html=True)

    col1, col2, col3 = st.columns([1, 3, 1])
    with col2:
        st.write(f"Page {page} of {num_pages}")

def read_file_content(uploaded_file):
    if uploaded_file.type == "application/pdf":
        pdf_reader = io.BytesIO(uploaded_file.getvalue())
        return extract_text(pdf_reader)
    else:
        return uploaded_file.getvalue().decode("utf-8", errors="ignore")

def main():
    st.title("Resume-Job Matcher")

    # Load data
    df = load_and_concat_data()

    # Filter data for the latest 3 days
    current_date = datetime.now().date()
    date_3_days_ago = current_date - timedelta(days=3)
    df['date'] = df['date_posted'].dt.date
    df_filtered = df[df['date'] >= date_3_days_ago]

    # Print count of records for each day
    for date in [current_date, current_date - timedelta(days=1), current_date - timedelta(days=2)]:
        count = df_filtered[df_filtered['date'] == date].shape[0]
        st.write(f"Records for {date}: {count}")

    # Clean description and create embeddings
    models = load_models()
    df_filtered['cleaned_description'] = df_filtered['description'].apply(remove_special_chars)

    for model_name in ['minilm', 'mpnet', 'paraphrase']:
        df_filtered[f'embeddings_{model_name}'] = df_filtered['cleaned_description'].apply(lambda x: models[model_name].encode(x))

    uploaded_file = st.file_uploader("Upload your resume", type=["txt", "pdf"], key="resume_uploader")
    if uploaded_file is not None:
        try:
            resume_text = read_file_content(uploaded_file)
            cleaned_resume = remove_special_chars(resume_text)
            st.subheader("Parsed Resume")
            st.text(cleaned_resume)

            resume_embeddings = generate_embeddings(cleaned_resume, models)

            for model_name in ['minilm', 'mpnet', 'paraphrase']:
                similarities = calculate_similarities(df_filtered[f'embeddings_{model_name}'].tolist(), resume_embeddings[model_name])
                for metric in ['cosine', 'pearson', 'euclidean']:
                    df_filtered[f'{model_name}_{metric}'] = [s[metric] for s in similarities]

            top_matches = get_top_matches(df_filtered, 50)
            st.subheader("Top 150 Matches (Before Groq Evaluation)")
            st.dataframe(top_matches[['title', 'company', 'location', 'model', 'metric']])

            groq_api_key = st.text_input("Enter your Groq API Key", type="password")
            if groq_api_key:
                client = groq.Groq(api_key=groq_api_key)
                st.subheader("Evaluating matches with Groq...")
                progress_bar = st.progress(0)
                for i, row in enumerate(top_matches.itertuples()):
                    groq_result = evaluate_with_groq(cleaned_resume, row.description, client)
                    top_matches.at[row.Index, 'groq_score'] = groq_result['score']
                    top_matches.at[row.Index, 'groq_reasoning'] = groq_result['reasoning']
                    progress_bar.progress((i + 1) / len(top_matches))

                top_100_matches = top_matches.nlargest(100, 'groq_score')
                st.subheader("Top 100 Matches After Groq Evaluation")
                display_data_explorer(top_100_matches)

        except Exception as e:
            st.error(f"An error occurred while processing the file: {str(e)}")

if __name__ == "__main__":
    main()