Spaces:
Sleeping
Sleeping
File size: 3,687 Bytes
8322a94 af2aec4 8322a94 af2aec4 8322a94 af2aec4 8322a94 af2aec4 8322a94 af2aec4 8322a94 af2aec4 8322a94 af2aec4 8322a94 af2aec4 8322a94 af2aec4 8322a94 af2aec4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 |
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn.preprocessing import MinMaxScaler
import re
from PyPDF2 import PdfReader
def extract_text_from_file(file):
    """Return the text content of an uploaded file object.

    Files whose ``type`` attribute is the PDF MIME type are handed to
    ``extract_text_from_pdf``; every other file is read in full and
    decoded as UTF-8.
    """
    is_pdf = file.type == "application/pdf"
    if not is_pdf:
        raw_bytes = file.read()
        return raw_bytes.decode('utf-8')
    return extract_text_from_pdf(file)
def extract_text_from_pdf(file):
    """Return the text of every page of a PDF, concatenated in page order.

    The file is opened with ``PdfReader`` and each page's extracted text
    is appended with no separator between pages.
    """
    pdf = PdfReader(file)
    return "".join(page.extract_text() for page in pdf.pages)
def clean_text(text):
    """Normalize text for token comparison.

    Each character the regex engine classifies as non-word is replaced by
    a single space (one-for-one, so runs are not collapsed), and the
    result is lowercased.
    """
    return re.sub(r'\W', ' ', text).lower()
def _jaccard_similarity(tokens_a, tokens_b):
    """Return |A & B| / |A | B| for two token sets, or 0.0 if both are empty."""
    union = tokens_a | tokens_b
    if not union:
        # Two empty documents share nothing; avoid dividing by zero.
        return 0.0
    return len(tokens_a & tokens_b) / len(union)


def calculate_similarity_metrics(resumes, keywords):
    """Score each resume against the keyword string with three metrics.

    Args:
        resumes: List of cleaned resume texts.
        keywords: Cleaned keyword string to compare every resume against.

    Returns:
        A tuple ``(cosine_sim, jaccard_sim, euclidean_sim)`` with one entry
        per resume, in input order:
          * cosine_sim: 1-D array of cosine similarities between the TF-IDF
            vector of ``keywords`` and that of each resume.
          * jaccard_sim: list of Jaccard indexes over whitespace-split
            token sets (0.0 when both sets are empty).
          * euclidean_sim: 1-D array of ``1 / (1 + d)`` where ``d`` is the
            Euclidean distance between the same TF-IDF vectors.
        When ``resumes`` is empty, all three results are empty.
    """
    if not resumes:
        # The vectorizer and pairwise metrics reject zero-sample input, so
        # short-circuit with empty results of the usual types.
        return np.array([]), [], np.array([])

    tfidf_vectorizer = TfidfVectorizer()
    # Fit on resumes and keywords together so they share one vocabulary;
    # the keyword document is deliberately placed last.
    tfidf_matrix = tfidf_vectorizer.fit_transform(list(resumes) + [keywords])
    keyword_vector = tfidf_matrix[-1]
    resume_vectors = tfidf_matrix[:-1]

    cosine_sim = cosine_similarity(keyword_vector, resume_vectors).flatten()

    # Tokenize the keywords once instead of once per resume.
    keyword_tokens = set(keywords.split())
    jaccard_sim = [
        _jaccard_similarity(keyword_tokens, set(resume.split()))
        for resume in resumes
    ]

    euclidean_dist = euclidean_distances(keyword_vector, resume_vectors).flatten()
    # Map distance to a similarity in (0, 1]: identical vectors score 1.
    euclidean_sim = 1 / (1 + euclidean_dist)

    return cosine_sim, jaccard_sim, euclidean_sim
def _combine_keywords(keywords_table):
    """Build the weighted keyword string from the sidebar table.

    Each non-blank keyword is repeated ``Priority`` times when the priority
    is a whole number written in decimal digits; otherwise it is included
    once. Blank or missing keywords are skipped, so the result is an empty
    string when no keyword has been entered.
    """
    weighted = []
    for keyword, priority in zip(keywords_table['Keyword'], keywords_table['Priority']):
        # Rows added through the dynamic editor hold None until filled in.
        if pd.isna(keyword):
            continue
        keyword_text = str(keyword).strip()
        if not keyword_text:
            continue
        priority_text = "" if pd.isna(priority) else str(priority).strip()
        # isdecimal() only accepts characters that int() can parse.
        repeat = int(priority_text) if priority_text.isdecimal() else 1
        weighted.extend([keyword_text] * repeat)
    return " ".join(weighted)


st.title("Resume Analyzer")

st.sidebar.subheader("Enter Keywords and Priority")
data = pd.DataFrame({
    'Keyword': [''] * 10,
    'Priority': [''] * 10,
})
keywords_df = st.sidebar.data_editor(data, num_rows="dynamic", key="keyword_table")

# Always assigned, so the check below cannot raise NameError when the
# table is empty, and is falsy when only blank keywords are present.
keywords_combined = _combine_keywords(keywords_df)

# NOTE(review): the 5-file limit stated in the label is not enforced here.
st.subheader("Upload up to 5 resumes (PDF or Text files)")
uploaded_files = st.file_uploader("Choose Resume Files", accept_multiple_files=True, type=["txt", "pdf"])

if uploaded_files and keywords_combined:
    resumes = []
    # Names are tracked alongside the texts so that a file which fails to
    # parse does not misalign the result table.
    resume_names = []
    with st.spinner("Analyzing Resumes..."):
        for file in uploaded_files:
            try:
                resume_text = extract_text_from_file(file)
                clean_resume = clean_text(resume_text)
            except Exception as e:
                # Best effort: report the bad file and keep going.
                st.error(f"Error processing {file.name}: {str(e)}")
            else:
                resumes.append(clean_resume)
                resume_names.append(file.name)

        if resumes:
            clean_keywords = clean_text(keywords_combined)
            cosine_scores, jaccard_scores, euclidean_scores = calculate_similarity_metrics(resumes, clean_keywords)

    if resumes:
        st.subheader("Resume Analysis Results")
        results_df = pd.DataFrame({
            'Resume': resume_names,
            'Cosine Similarity': cosine_scores,
            'Jaccard Index': jaccard_scores,
            'Euclidean Similarity': euclidean_scores,
        })

        # Rescale each metric across the resumes, then average the three
        # rescaled metrics into a single overall score per resume.
        scaler = MinMaxScaler()
        normalized_scores = scaler.fit_transform(results_df[['Cosine Similarity', 'Jaccard Index', 'Euclidean Similarity']])
        overall_scores = np.mean(normalized_scores, axis=1)
        results_df['Overall Score'] = overall_scores

        # Rank 1 is the best match; ties share the lowest rank.
        results_df['Rank'] = results_df['Overall Score'].rank(ascending=False, method='min').astype(int)
        results_df = results_df.sort_values('Rank')
        st.dataframe(results_df)
    else:
        st.warning("None of the uploaded files could be processed.")
else:
    st.info("Please upload resumes and enter keywords with priority.")