from itertools import chain
from collections import defaultdict

from flashtext import KeywordProcessor

from models.prototype.models import Models
from models.prototype.segmenter import ResumeSegmenter
# from output_model import OutputModel, WorkExperience


class ResumeParser:
    def __init__(self) -> None:
        self.resumeSegmenter = ResumeSegmenter()
        self.models = Models()

    def get_date_index(self, clean_resume_lines, date):
        # Return the index of every line that contains the given date string.
        return [i for i, line in enumerate(clean_resume_lines) if date in line]

    # Better suited to a utils module.
    def sort_tokens_table(self, tokens_data):
        # Invert (label, tokens) pairs into a token -> label lookup table.
        table = {}
        for key, tokens in tokens_data:
            for token in tokens:
                table[token] = key
        return table
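
    # For example (hypothetical values):
    #   self.sort_tokens_table([("date", ["Jan 2020"]), ("title", ["Engineer"])])
    #   -> {"Jan 2020": "date", "Engineer": "title"}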

    def split_work_exp(self, resume_lines, start_index, end_index, work_dates):
        dates_indexes = [
            self.get_date_index(resume_lines[start_index:end_index], work_date)
            for work_date in work_dates
        ]
        dates_indexes = list(chain.from_iterable(dates_indexes))
        dates_indexes = [i + start_index for i in dates_indexes]
        # The boundary list must be unique and ordered, so deduplicate with
        # set() before sorting; sorting first and converting to a set after
        # would discard the order.
        dates_indexes = sorted(set([start_index + 1] + dates_indexes + [end_index]))
        list_single_work_exp = []
        for i in range(len(dates_indexes) - 1):
            section = resume_lines[dates_indexes[i]:dates_indexes[i + 1]]
            if len(section) == 0:
                continue
            list_single_work_exp.append(section)
        return list_single_work_exp
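
    # Sketch of the partitioning, with hypothetical indexes: for
    # start_index=4, end_index=10 and dates found on lines 5 and 8, the
    # boundary list is [5, 8, 10], yielding chunks resume_lines[5:8] and
    # resume_lines[8:10], i.e. one chunk per dated work entry.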

    def extract_section_text(self, resume_lines, section_header="work_and_employment"):
        text_segments, sections = self.resumeSegmenter.get_parsed_sections(resume_lines)
        start_index = sections[section_header][0]
        end_index = sections[section_header][1]
        # On the basis that dates would be unique within the section.
        return start_index, end_index

    def format_output(self, keywords, work_section_list, isWorkExp=True):
        if isWorkExp:
            headlines = [text[0] for text in work_section_list]
        else:
            headlines = work_section_list
        table = self.sort_tokens_table(keywords)
        tokens_processor = KeywordProcessor()
        list_keywords = list(chain.from_iterable([tokens[1] for tokens in keywords]))
        tokens_processor.add_keywords_from_list(list_keywords)
        data = []
        for i, header in enumerate(headlines):
            current_data = defaultdict(list)
            tokens = tokens_processor.extract_keywords(header)
            for token in tokens:
                current_data[table[token]].append(token)
            if isWorkExp:
                current_data["description"] = work_section_list[i][1:]
            data.append(dict(current_data))
        return data
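
    # Illustrative output shape, assuming the NER models found every field:
    # [{"date": ["Jan 2020"], "title": ["Engineer"], "company": ["Acme"],
    #   "description": ["Built the data pipeline."]}]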

    def parse_work_history(self, resume_lines):
        start_index, end_index = self.extract_section_text(resume_lines)
        section_lines = resume_lines[start_index:end_index]
        work_dates = self.models.get_ner(section_lines, "date")
        single_work_experiences = self.split_work_exp(resume_lines, start_index, end_index, work_dates)
        job_positions = self.models.get_ner(section_lines, "job title")
        companies = self.models.get_ner(section_lines, "company")
        keywords = [("date", work_dates), ("title", job_positions), ("company", companies)]
        return self.format_output(keywords, single_work_experiences)

    def parse_education(self, resume_lines):
        start_index, end_index = self.extract_section_text(resume_lines, "education_and_training")
        tokens = ["degree", "university", "degree field", "date", "location"]
        # Collect a (label, matches) pair for every label so format_output
        # can build its token -> label table, mirroring parse_work_history.
        keywords = [
            (token, self.models.get_ner(resume_lines[start_index + 1:end_index], token))
            for token in tokens
        ]
        output = self.format_output(keywords, resume_lines[start_index:end_index], False)
        output = [res for res in output if res]
        return output

    def parse_basic_info(self, resume_lines):
        start_index, end_index = self.extract_section_text(resume_lines, "basics_info")
        # tokens = ["person", "email", "phone"]
        tokens = ["person"]
        keywords = [
            (token, self.models.get_ner(resume_lines[start_index:end_index], token))
            for token in tokens
        ]
        output = {}
        for token, result in keywords:
            if len(result) > 0:
                output[token] = result[0]
        return output

    def parse(self, resume_lines):
        jobs = self.parse_work_history(resume_lines)
        education = self.parse_education(resume_lines)
        basic_info = self.parse_basic_info(resume_lines)
        return {"basic_info": basic_info, "education": education, "work_experience": jobs}