# detailed-resume-parser / resume_parser.py
from itertools import chain
from models import Models
#from output_model import OutputModel, WorkExperience
from segmenter import ResumeSegmenter
from flashtext import KeywordProcessor
from collections import defaultdict
import re
import wordninja
from utils import percentage_difference
from nltk import word_tokenize
class ResumeParser():
def __init__(self) -> None:
self.resumeSegmenter = ResumeSegmenter()
self.models = Models()
def get_date_index(self, clean_resume_lines, date):
indexes = [i for i, line in enumerate(clean_resume_lines) if date in line]
return indexes
#better suited to a utils file
def sort_tokens_table(self, tokens_data):
table = {}
for key, tokens in tokens_data:
for token in tokens:
table[token] = key
return table
def split_work_exp(self, resume_lines, start_index, end_index, work_dates):
dates_indexes = [self.get_date_index(resume_lines[start_index:end_index], work_date) for work_date in work_dates]
dates_indexes = list(chain.from_iterable(dates_indexes))
dates_indexes = [i + start_index for i in dates_indexes]
dates_indexes = sorted([start_index+1] + dates_indexes + [end_index])
dates_indexes = set(dates_indexes)
dates_indexes = sorted(list(dates_indexes))
individual_sections = []
for i, index in enumerate(dates_indexes):
section = resume_lines[index:dates_indexes[min(i+1, len(dates_indexes)-1)]]
if len(section) == 0:
continue
individual_sections.append(section)
return individual_sections
def extract_section_text(self, resume_lines, section_header = "work_and_employment"):
_ , sections = self.resumeSegmenter.get_parsed_sections(resume_lines)
if sections is None:
return None
print(sections)
if section_header not in sections:
return None
start_index = sections[section_header][0]
end_index = sections[section_header][1]
#on the bases dates would be unique
return start_index, end_index
#more of a utils function
def sort_tokens_table(self, tokens_data):
table = {}
for key, tokens in tokens_data:
for token in tokens:
table[token] = key
return table
def format_output(self, keywords, headlines, isWorkExp=True):
data = []
for section in headlines:
extracted_data = {}
paragraph = '\n'.join(section) if isWorkExp else ' '.join(section)
extracted_data['description'] = paragraph
recovered_headlines = ' '.join(wordninja.split(paragraph))
if percentage_difference(len(word_tokenize(paragraph)), len(word_tokenize(recovered_headlines))) > 50:
extracted_data['description'] = recovered_headlines
for attr in keywords:
result = list(set([s for s in attr[1] if s in paragraph or s in recovered_headlines]))
if len(result) > 0:
extracted_data[attr[0]] = result
data.append(extracted_data)
return data
def parse_work_history(self, resume_lines, sections):
start_index, end_index = sections['work_and_employment']
text = ' '.join(resume_lines[start_index:end_index])
recovered_text = ' '.join(wordninja.split(text))
work_dates, companies, locations = self.models.get_ner(text, recovered_text)
single_work_experiences = self.split_work_exp(resume_lines, start_index, end_index, work_dates)
entity_dict = self.models.get_custom_ner(' '.join(resume_lines[start_index:end_index]))
job_positions = entity_dict['job title']
keywords = [("date", work_dates), ("title", job_positions), ("company", companies), ("location", locations)]
return self.format_output(keywords, single_work_experiences)
def parse_education(self, resume_lines, sections):
start_index, end_index = sections["education_and_training"]
text = ' '.join(resume_lines[start_index:end_index])
dates, universities, locations = self.models.get_ner(text, ' '.join(wordninja.split(text)))
single_education_experiences = self.split_work_exp(resume_lines, start_index, end_index, dates)
entity_dict = self.models.get_custom_ner(' '.join(resume_lines[start_index+1:end_index]))
degrees = entity_dict['degree']
majors = entity_dict['major']
keywords = [("date", dates), ("major", majors), ("degree", degrees),
("university", universities), ("location", locations)]
output = self.format_output(keywords, single_education_experiences, False)
output = [res for res in output if res]
return output
def parse_basic_info(self,resume_lines, sections):
start_index, end_index = sections["basics_info"]
text = ' '.join(resume_lines[start_index:end_index])
phone_pattern = r'(?:(?:\+?\d{1,2}[-.\s]?)?(?:\(\d{1,4}\)|\d{1,4})[-.\s]?)?(?:\d{1,5}[-.\s]?){1,4}\d{1,6}'
email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
entites = self.models.ner(text)
if len(entites) == 0:
entites = self.models.ner(' '.join(resume_lines))
output = {}
score = 0
for entity in entites:
if entity['entity_group'] == 'PER' and entity['score'] > score and ' ' in entity['word']:
output['name']= entity['word']
score = entity['score']
email = re.findall(email_pattern, text)
phone = re.findall(phone_pattern, text)
if email == '':
email = re.findall(email_pattern, ' '.join(resume_lines))
if phone == '':
phone = re.findall(phone_pattern, ' '.join(resume_lines))
output['email'] = email[0] if len(email) > 0 else ''
output['phone'] = phone[0] if len(phone) > 0 else ''
return output
def parse(self, resume_lines):
self.resumeSegmenter.resume_segments = {
'objective': [],
'work_and_employment': [],
'education_and_training': [],
'skills': [],
'accomplishments': [],
'misc': []
}
self.resumeSegmenter.resume_indices = []
sections = self.resumeSegmenter.segment(resume_lines)
if sections is None:
return {}
jobs = self.parse_work_history(resume_lines, sections) if 'work_and_employment' in sections else {}
education = self.parse_education(resume_lines, sections) if 'education_and_training' in sections else {}
basic_info = self.parse_basic_info(resume_lines, sections) if 'basics_info' in sections else {}
result = {"basic_info":basic_info, "education":education, "work_experience":jobs}
for section in sections.keys():
if section not in ['work_and_employment', 'education_and_training', 'basics_info']:
text = '\n'.join(resume_lines[sections[section][0]:sections[section][1]])
result[section] =' '.join(wordninja.split(text))
return result