Spaces:
Build error
Build error
Commit
·
4770145
1
Parent(s):
57ed03b
Create careermatcher.py
Browse files- careermatcher.py +241 -0
careermatcher.py
ADDED
@@ -0,0 +1,241 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import plotly.express as px
|
3 |
+
import pandas as pd
|
4 |
+
import numpy as np
|
5 |
+
import pickle as pkl
|
6 |
+
import spacy
|
7 |
+
from spacy.lang.en.stop_words import STOP_WORDS
|
8 |
+
nlp = spacy.load('en_core_web_lg')
|
9 |
+
import re
|
10 |
+
import docx2txt
|
11 |
+
from spacy.matcher import PhraseMatcher
|
12 |
+
|
13 |
+
# from transformers import BertForSequenceClassification
|
14 |
+
# from transformers import BertTokenizer
|
15 |
+
# Load model directly
|
16 |
+
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
17 |
+
|
18 |
+
# tokenizer = AutoTokenizer.from_pretrained("liberatoratif/BERT-resume-job-recommender")
|
19 |
+
# model = AutoModelForSequenceClassification.from_pretrained("liberatoratif/BERT-resume-job-recommender")
|
20 |
+
|
21 |
+
matcher = PhraseMatcher(nlp.vocab)
|
22 |
+
|
23 |
+
import torch
|
24 |
+
|
25 |
+
# Configure the browser tab title/icon and the page layout.
st.set_page_config(
    page_title="Resume Scanner",
    # NOTE(review): the icon was mis-encoded in the source this was recovered
    # from; a document emoji is a best guess -- confirm against the original.
    page_icon="📄",
    layout="wide",
    initial_sidebar_state="expanded",
)

# Pickle file holding the label encoder that maps class indices to domains.
enc_dir = "target_encodings.pkl"
# Text file with one skill phrase per line, used to build PhraseMatcher patterns.
matcher_dir = "linkedin_skill.txt"
|
36 |
+
|
37 |
+
|
38 |
+
# @st.cache
|
39 |
+
def bert():
    """Load the pretrained sequence-classification model.

    Returns:
        The object returned by
        ``AutoModelForSequenceClassification.from_pretrained`` for the
        ``liberatoratif/BERT-resume-job-recommender`` checkpoint.
    """
    return AutoModelForSequenceClassification.from_pretrained(
        "liberatoratif/BERT-resume-job-recommender"
    )
|
43 |
+
|
44 |
+
# @st.cache
|
45 |
+
def bert_token():
    """Load the tokenizer that belongs to the classification checkpoint.

    Returns:
        The object returned by ``AutoTokenizer.from_pretrained`` for the
        ``liberatoratif/BERT-resume-job-recommender`` checkpoint.
    """
    return AutoTokenizer.from_pretrained(
        "liberatoratif/BERT-resume-job-recommender"
    )
|
49 |
+
|
50 |
+
# @st.cache
|
51 |
+
def label_enc():
    """Load the pickled label encoder stored at ``enc_dir``.

    Returns:
        Whatever object the pickle file contains; the rest of the script
        calls ``inverse_transform`` on it.
    """
    # NOTE: unpickling can execute arbitrary code, so the file at ``enc_dir``
    # must be a trusted artifact shipped with the app, never user input.
    # The context manager closes the handle, which the bare open() did not.
    with open(enc_dir, 'rb') as handle:
        return pkl.load(handle)
|
54 |
+
|
55 |
+
# @st.cache
|
56 |
+
def ph_match():
    """Return the full contents of the skills file at ``matcher_dir``.

    The file is read as UTF-8 text and returned unmodified as one string.
    """
    with open(matcher_dir, 'r', encoding='utf-8') as handle:
        return handle.read()
|
61 |
+
|
62 |
+
|
63 |
+
# Load the label encoder, classifier and tokenizer used by the scan step.
label_encoder = label_enc()
model_loaded = bert()
tokenizer_loaded = bert_token()

# Raw text of the skills file (one phrase per line).
txt = ph_match()

# Fix the sidebar width for both its expanded and collapsed states.
st.markdown(
    """
<style>
[data-testid="stSidebar"][aria-expanded="true"] > div:first-child {
    width: 250px;
}
[data-testid="stSidebar"][aria-expanded="false"] > div:first-child {
    width: 150px;
    margin-left: -500px;
}
</style>
""",
    unsafe_allow_html=True,
)

# Page title and tagline. "centre" is not a valid CSS text-align value, so
# the headings use "center", and the <h6> is closed with a matching tag.
st.markdown(
    "<h1 style='text-align: center; color: cyan;'>RESUME/CV SCANNER</h1>",
    unsafe_allow_html=True,
)
st.markdown(
    "<h6 style='text-align: center; color: white;'>Know which domain fit's your resume :)</h6>",
    unsafe_allow_html=True,
)

# spaCy's English stop words, kept as a list for the token filter.
stops = list(STOP_WORDS)
|
90 |
+
|
91 |
+
def extract_text_from_docx(docx_path):
    """Extract the text of a .docx document, turning tabs into spaces.

    Args:
        docx_path: Passed unchanged to ``docx2txt.process``.

    Returns:
        The extracted text with every tab replaced by a single space, or
        ``None`` when ``docx2txt.process`` returns a falsy value.
    """
    raw = docx2txt.process(docx_path)
    if not raw:
        return None
    return raw.replace('\t', ' ')
|
96 |
+
|
97 |
+
def cleanResume(resumeText):
    """Reduce resume text to ASCII letters/digits separated by single spaces.

    Removes ``@handle`` mentions, URLs of the form ``scheme://...`` and every
    character that is not an ASCII letter, digit, space or tab, then collapses
    all whitespace runs to one space and trims both ends.

    Args:
        resumeText: The text to clean.

    Returns:
        The cleaned, single-line string (empty if nothing survives).
    """
    # Raw string: the previous plain literal relied on invalid escape
    # sequences (\w, \/, \S), which newer Python versions warn about.
    stripped = re.sub(
        r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+://\S+)",
        " ",
        resumeText,
    )
    # After the substitution only ASCII alphanumerics, spaces and tabs remain,
    # so a separate non-ASCII pass or line-joining step would change nothing.
    return ' '.join(stripped.split())
|
102 |
+
|
103 |
+
def complete_pack(x):
    """Tokenize ``x`` with spaCy and drop stop words.

    Args:
        x: Text to tokenize with the module-level ``nlp`` pipeline.

    Returns:
        A list of lower-cased token texts, in document order, excluding any
        token whose lower-cased text appears in ``stops``.
    """
    # Hash-based lookup instead of scanning the ``stops`` list per token.
    stop_set = frozenset(stops)
    lowered = (token.text.lower() for token in nlp(x))
    return [word for word in lowered if word not in stop_set]
|
107 |
+
|
108 |
+
|
109 |
+
# Parsed-resume state, defined up front so it exists (as None) even when no
# file has been uploaded yet.
resume_text = None
re_temp = None
resume_text_spacy = None

with st.sidebar:
    # Restrict the picker to .docx, matching the notice shown below it.
    upload = st.file_uploader("DRAG AND DROP YOUR RESUME NOW", type=["docx"])
    st.markdown(
        "<h5 style='text-align: center; color: red;'>Only .docx type files accepted</h5>",
        unsafe_allow_html=True,
    )
    if upload:
        try:
            extracted = extract_text_from_docx(upload)
            if extracted is None:
                # The extractor returns None when the document yields no text.
                st.error('No readable text was found in the uploaded document')
            else:
                resume_text = extracted.replace('\n\n', ' ')
                re_temp = cleanResume(resume_text)
                resume_text_spacy = nlp(re_temp)
        except Exception:
            # UI boundary: any extraction/parsing failure is reported to the
            # user, and partially filled state is cleared.
            resume_text = re_temp = resume_text_spacy = None
            st.error('WRONG FILE FORMAT : Only .docx(WORD DOC) type of files are accepted')
|
123 |
+
|
124 |
+
|
125 |
+
def _show_contact_section(title, found, missing_message):
    """Render one contact-detail section: a heading plus values or a hint.

    Args:
        title: Heading text, e.g. ``"EMAIL"``.
        found: Strings extracted from the resume; duplicates are collapsed.
        missing_message: Error text shown when ``found`` is empty.
    """
    unique = sorted(set(found))
    # NOTE(review): the status icons were mis-encoded in the source this was
    # recovered from; check mark / cross are best guesses -- confirm.
    marker = "✔️" if unique else "❌"
    st.markdown(
        f"<h4 style='text-align: center; color: blue;'>{title} {marker} </h4>",
        unsafe_allow_html=True,
    )
    if unique:
        st.success(", ".join(unique))
    else:
        st.error(missing_message)


# NOTE(review): the button icon was mis-encoded in the source; a magnifier is
# a best guess -- confirm against the original file.
scan = st.button('SCAN 🔍')
if scan:
    try:
        # --- Contact details, taken from the uncleaned resume text ---------
        # IGNORECASE so addresses written with capitals are found as well.
        emails = re.findall(
            r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+",
            resume_text,
            flags=re.IGNORECASE,
        )
        phone = re.findall(r'[\+\(]?[1-9][0-9 .\-\(\)]{8,}[0-9]', resume_text)
        # The pattern has several capture groups, so findall yields tuples;
        # the first group is the complete URL.
        links = [
            groups[0]
            for groups in re.findall(
                r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))",
                resume_text,
            )
        ]

        # --- Skill extraction ----------------------------------------------
        # One pattern per non-blank line of the skills file.
        patterns = [
            nlp.make_doc(line) for line in txt.split('\n') if line.strip()
        ]
        # Patterns are passed as a list: the legacy (key, None, *docs) call
        # form is rejected by spaCy v3.
        matcher.add("SKILLS", patterns)
        matched = [
            resume_text_spacy[start:end].text
            for _match_id, start, end in matcher(resume_text_spacy)
        ]
        skills = ' '.join(complete_pack(cleanResume(' '.join(matched))))

        # --- Domain prediction ---------------------------------------------
        model_loaded.eval()
        # Truncate to the model's position limit so long resumes do not
        # overflow it (falls back to 512 if the config lacks the field).
        max_len = getattr(model_loaded.config, "max_position_embeddings", 512)
        token_ids = tokenizer_loaded.encode(
            skills,
            add_special_tokens=True,
            truncation=True,
            max_length=max_len,
        )
        input_ids = torch.tensor(token_ids).unsqueeze(0)  # add batch dimension

        with torch.no_grad():
            logits = model_loaded(input_ids, return_dict=True).logits

        # Softmax turns the raw logits into probabilities that sum to 1, so
        # the figures shown as percentages really are percentages.
        probs = torch.softmax(logits, dim=-1)[0].cpu().numpy()
        pred_idx = int(np.argmax(probs))

        labels = list(label_encoder.inverse_transform(list(range(len(probs)))))
        domain = labels[pred_idx]
        data = pd.DataFrame({'Domains': labels, 'Probs': probs})

        st.markdown(
            f"<span style='color: #BF3EFF;'>**Your skills are matching to :**</span> <span style='color: #54FF9F;'>{domain}</span>",
            unsafe_allow_html=True,
        )

        percent_col = 'Percentage Prediction of your Domain'
        percent_table = data.rename(columns={'Probs': percent_col})
        percent_table[percent_col] = (percent_table[percent_col] * 100).round(2)

        st.markdown(
            "<h3 style='text-align: center; color: blue;'>PERCENT OF YOUR DOMAIN MATCH</h3>",
            unsafe_allow_html=True,
        )
        st.dataframe(percent_table.sort_values(percent_col, ascending=False))
        domains = px.bar(data, x='Domains', y='Probs', width=800, height=400)
        st.plotly_chart(domains)

        # --- Contact-detail report -----------------------------------------
        _show_contact_section(
            "EMAIL",
            emails,
            'Email-Id is not present try including it in your Resume',
        )
        _show_contact_section(
            "MOBILE NO",
            phone,
            'Mobile number is not present try including it in your Resume',
        )
        _show_contact_section(
            "LINKS",
            links,
            "Link's are not present try including your Github or LinkedIn Profile in your Resume",
        )

    except Exception as e:
        # UI boundary: show the underlying error, then a retry hint. This
        # also covers scanning before any resume has been uploaded.
        st.write(e)
        st.error("😲 Try uploading your file again")
|
240 |
+
|
241 |
+
|