Niharmahesh committed on
Commit 31cc020 · verified · 1 Parent(s): 0aa18c3

Delete pages/test.py

Files changed (1)
  1. pages/test.py +0 -236
pages/test.py DELETED
@@ -1,236 +0,0 @@
- import streamlit as st
- import pandas as pd
- import numpy as np
- from sklearn.metrics.pairwise import cosine_similarity
- from scipy.stats import pearsonr
- from scipy.spatial.distance import euclidean
- from sentence_transformers import SentenceTransformer
- import groq
- import math
- import json
- from huggingface_hub import HfApi
- import io
- from pdfminer.high_level import extract_text
- import pyarrow.feather as feather
- import re
- from datetime import datetime, timedelta
-
- HF_TOKEN = st.secrets["HF_TOKEN"]
- HF_USERNAME = st.secrets["HF_USERNAME"]
- DATASET_NAME = "jobeasz"
-
- @st.cache_data(ttl=3600)
- def load_and_concat_data():
-     api = HfApi()
-     dataset_files = api.list_repo_files(repo_id=f"{HF_USERNAME}/{DATASET_NAME}", repo_type="dataset")
-     feather_files = [file for file in dataset_files if file.endswith('.feather')]
-
-     all_data = []
-     for file in feather_files:
-         try:
-             file_content = api.hf_hub_download(repo_id=f"{HF_USERNAME}/{DATASET_NAME}", filename=file, repo_type="dataset", token=HF_TOKEN)
-             df = feather.read_feather(file_content)
-             all_data.append(df)
-         except Exception:
-             pass # Silently skip files that can't be processed
-
-     if not all_data:
-         return pd.DataFrame()
-
-     concatenated_df = pd.concat(all_data, ignore_index=True)
-
-     columns_to_keep = [
-         'site', 'job_url', 'title', 'company', 'location',
-         'job_type', 'date_posted', 'is_remote', 'company_url', 'description'
-     ]
-     filtered_df = concatenated_df[columns_to_keep].reset_index(drop=True)
-     filtered_df['date_posted'] = pd.to_datetime(filtered_df['date_posted'], errors='coerce')
-
-     # Drop duplicates and rows with NaT in date_posted
-     filtered_df = filtered_df.drop_duplicates().dropna(subset=['date_posted'])
-     #filtering based on data in 2024
-     filtered_df = filtered_df[filtered_df['date_posted'].dt.year==2024]
-     # Convert titles and company name to lowercase
-     filtered_df['title'] = filtered_df['title'].str.lower()
-     filtered_df['company'] = filtered_df['company'].str.lower()
-
-     # Function to clean the location
-     def clean_location(location):
-         if pd.isna(location):
-             return location # Return NaN as is
-         # Convert to lowercase
-         location = location.lower()
-         # Remove ', us' or ', usa' from the end using regex
-         location = re.sub(r',\s*(us|usa)$', '', location)
-         return location
-
-     # Clean the location in place
-     filtered_df['location'] = filtered_df['location'].apply(clean_location)
-     #added new line to drop duplicate records
-     filtered_df = filtered_df.drop_duplicates()
-
-     return filtered_df
-
- def remove_special_chars(text):
-     if pd.isna(text):
-         return text
-     # Remove special characters and markdown formatting
-     cleaned_text = re.sub(r'[*\n\-_]', ' ', text)
-     # Remove extra whitespace
-     cleaned_text = ' '.join(cleaned_text.split())
-     return cleaned_text
-
- @st.cache_resource
- def load_models():
-     return {
-         'minilm': SentenceTransformer('all-MiniLM-L6-v2'),
-         'mpnet': SentenceTransformer('all-mpnet-base-v2'),
-         'paraphrase': SentenceTransformer('paraphrase-MiniLM-L6-v2')
-     }
-
- def generate_embeddings(text, models):
-     return {
-         'minilm': models['minilm'].encode(text),
-         'mpnet': models['mpnet'].encode(text),
-         'paraphrase': models['paraphrase'].encode(text)
-     }
-
- def calculate_similarities(job_embeddings, resume_embedding):
-     similarities = []
-     for job_embedding in job_embeddings:
-         job_emb = np.array(job_embedding).reshape(1, -1)
-         res_emb = resume_embedding.reshape(1, -1)
-         cosine_sim = cosine_similarity(job_emb, res_emb)[0][0]
-         pearson_corr = pearsonr(job_embedding, resume_embedding)[0]
-         euclidean_dist = euclidean(job_embedding, resume_embedding)
-         similarities.append({
-             'cosine': cosine_sim,
-             'pearson': pearson_corr,
-             'euclidean': euclidean_dist
-         })
-     return similarities
-
- def get_top_matches(df, n=50):
-     top_matches = pd.DataFrame()
-     for model_name in ['minilm', 'mpnet', 'paraphrase']:
-         for metric in ['cosine', 'pearson', 'euclidean']:
-             col_name = f'{model_name}_{metric}'
-             ascending = metric == 'euclidean'
-             top_n = df.nsmallest(n, col_name) if ascending else df.nlargest(n, col_name)
-             top_n['model'] = model_name
-             top_n['metric'] = metric
-             top_matches = pd.concat([top_matches, top_n])
-     return top_matches.drop_duplicates().head(150)
-
- @st.cache_data
- def evaluate_with_groq(resume_text, job_description_text, client):
-     prompt = f"""
-     Resume: {resume_text}
-     Job Description: {job_description_text}
-     Based on the above information, rate the match quality on a scale of 0-100 and provide reasoning.
-     Return your response in the following JSON format:
-     {{ "score": <integer between 0 and 100>, "reasoning": "<your explanation>" }}
-     """
-     response = client.chat.completions.create(
-         messages=[
-             {"role": "user", "content": prompt}
-         ],
-         model="mixtral-8x7b-32768",
-         max_tokens=200,
-     )
-     return json.loads(response.choices[0].message.content)
-
- def display_data_explorer(df):
-     st.subheader("Data Explorer")
-     items_per_page = 15
-     num_pages = math.ceil(len(df) / items_per_page)
-     col1, col2, col3 = st.columns([1, 3, 1])
-     with col2:
-         page = st.number_input("Page", min_value=1, max_value=num_pages, value=1)
-     start_idx = (page - 1) * items_per_page
-     end_idx = start_idx + items_per_page
-     page_df = df.iloc[start_idx:end_idx]
-
-     def make_clickable(url, text):
-         return f'<a href="{url}" target="_blank" style="color: #4e79a7;">{text}</a>'
-
-     page_df['job_url'] = page_df.apply(lambda row: make_clickable(row['job_url'], 'Link'), axis=1)
-     page_df['company_url'] = page_df.apply(lambda row: make_clickable(row['company_url'], row['company']), axis=1)
-
-     display_columns = ['title', 'company_url', 'location', 'job_type', 'date_posted', 'job_url', 'groq_score', 'groq_reasoning']
-     st.write(page_df[display_columns].to_html(escape=False, index=False), unsafe_allow_html=True)
-
-     col1, col2, col3 = st.columns([1, 3, 1])
-     with col2:
-         st.write(f"Page {page} of {num_pages}")
-
- def read_file_content(uploaded_file):
-     if uploaded_file.type == "application/pdf":
-         pdf_reader = io.BytesIO(uploaded_file.getvalue())
-         return extract_text(pdf_reader)
-     else:
-         return uploaded_file.getvalue().decode("utf-8", errors="ignore")
-
- def main():
-     st.title("Resume-Job Matcher")
-
-     # Load data
-     df = load_and_concat_data()
-
-     # Filter data for the latest 3 days
-     current_date = datetime.now().date()
-     date_3_days_ago = current_date - timedelta(days=3)
-     df['date'] = df['date_posted'].dt.date
-     df_filtered = df[df['date'] >= date_3_days_ago]
-
-     # Print count of records for each day
-     for date in [current_date, current_date - timedelta(days=1), current_date - timedelta(days=2)]:
-         count = df_filtered[df_filtered['date'] == date].shape[0]
-         st.write(f"Records for {date}: {count}")
-
-     # Clean description and create embeddings
-     models = load_models()
-     df_filtered['cleaned_description'] = df_filtered['description'].apply(remove_special_chars)
-
-     for model_name in ['minilm', 'mpnet', 'paraphrase']:
-         df_filtered[f'embeddings_{model_name}'] = df_filtered['cleaned_description'].apply(lambda x: models[model_name].encode(x))
-
-     uploaded_file = st.file_uploader("Upload your resume", type=["txt", "pdf"], key="resume_uploader")
-     if uploaded_file is not None:
-         try:
-             resume_text = read_file_content(uploaded_file)
-             cleaned_resume = remove_special_chars(resume_text)
-             st.subheader("Parsed Resume")
-             st.text(cleaned_resume)
-
-             resume_embeddings = generate_embeddings(cleaned_resume, models)
-
-             for model_name in ['minilm', 'mpnet', 'paraphrase']:
-                 similarities = calculate_similarities(df_filtered[f'embeddings_{model_name}'].tolist(), resume_embeddings[model_name])
-                 for metric in ['cosine', 'pearson', 'euclidean']:
-                     df_filtered[f'{model_name}_{metric}'] = [s[metric] for s in similarities]
-
-             top_matches = get_top_matches(df_filtered, 50)
-             st.subheader("Top 150 Matches (Before Groq Evaluation)")
-             st.dataframe(top_matches[['title', 'company', 'location', 'model', 'metric']])
-
-             groq_api_key = st.text_input("Enter your Groq API Key", type="password")
-             if groq_api_key:
-                 client = groq.Groq(api_key=groq_api_key)
-                 st.subheader("Evaluating matches with Groq...")
-                 progress_bar = st.progress(0)
-                 for i, row in enumerate(top_matches.itertuples()):
-                     groq_result = evaluate_with_groq(cleaned_resume, row.description, client)
-                     top_matches.at[row.Index, 'groq_score'] = groq_result['score']
-                     top_matches.at[row.Index, 'groq_reasoning'] = groq_result['reasoning']
-                     progress_bar.progress((i + 1) / len(top_matches))
-
-                 top_100_matches = top_matches.nlargest(100, 'groq_score')
-                 st.subheader("Top 100 Matches After Groq Evaluation")
-                 display_data_explorer(top_100_matches)
-
-         except Exception as e:
-             st.error(f"An error occurred while processing the file: {str(e)}")
-
- if __name__ == "__main__":
-     main()