"""Heuristic segmentation of a resume (given as a list of lines) into sections."""

import json

import nltk
import wordninja
from flashtext import KeywordProcessor
from nltk.tokenize import word_tokenize, LineTokenizer

from utils import get_average_words_per_line, get_average_line_len

nltk.download('punkt')


class ResumeSegmenter():
    """Locate section-header lines in a resume and slice it into sections.

    Header keywords are loaded from a JSON file whose ``"section_headers"``
    entry is handed to ``KeywordProcessor.add_keywords_from_dict``.

    Attributes:
        resume_segments: maps a section name to the indices of the lines
            recognised as headers of that section.
        resume_indices: indices of all recognised header lines, in the order
            they were found.
        section_headers: the ``"section_headers"`` entry of the JSON file.
        keyword_processor: flashtext processor loaded with those headers.
    """

    # Section names every instance starts with.
    _DEFAULT_SECTIONS = (
        'objective',
        'work_and_employment',
        'education_and_training',
        'skills',
        'accomplishments',
        'misc',
    )

    # A keyword line with more tokens than this fraction of the average
    # words-per-line (and longer than the average line length) is not
    # treated as a header.
    _HEADER_WORD_RATIO = 0.75

    def __init__(self, sections_path=r"models/prototype/sections.json"):
        """Load the header keywords and initialise empty segmentation state.

        Args:
            sections_path: path of the JSON file holding a
                ``"section_headers"`` mapping.
        """
        self._reset_state()

        # The context manager closes the file; no explicit close() is needed.
        with open(sections_path, encoding="utf-8") as f:
            data = json.load(f)
        self.section_headers = data["section_headers"]

        self.keyword_processor = KeywordProcessor()
        self.keyword_processor.add_keywords_from_dict(
            keyword_dict=self.section_headers
        )

    def _reset_state(self):
        """Forget header positions found for any previously processed resume."""
        self.resume_segments = {name: [] for name in self._DEFAULT_SECTIONS}
        self.resume_indices = []

    def find_segment_indices(self, text_list):
        """Record which lines of ``text_list`` look like section headers.

        Results are stored in ``self.resume_indices`` and
        ``self.resume_segments``; state left over from an earlier resume is
        discarded first, so one instance can process several resumes.

        A line whose tokenised form (tokens joined by single spaces) contains
        a keyword is replaced in ``text_list`` by that tokenised form.

        Args:
            text_list: the resume as a list of strings; mutated in place as
                described above.
        """
        # Indices are relative to text_list, so stale ones must not survive.
        self._reset_state()

        # NOTE(review): presumably the mean token count and mean character
        # length per line - confirm against utils.
        average_words_per_line = get_average_words_per_line(text_list)
        average_sentence_length = get_average_line_len(text_list)

        for i, line in enumerate(text_list):
            # Empty lines cannot be headers (and would break the indexing
            # below); lines starting lowercase or ending with a period are
            # skipped as well.
            if not line or line[0].islower() or line[-1] == '.':
                continue

            # Prefer keywords found in the tokenised form of the line; fall
            # back to the raw line when tokenising yields none.
            normalized = ' '.join(word_tokenize(line))
            kys = self.keyword_processor.extract_keywords(normalized)
            if kys:
                text_list[i] = line = normalized
            else:
                kys = self.keyword_processor.extract_keywords(line)
            if not kys:
                continue

            # A keyword inside a line that is long by both measures is not
            # treated as a header.
            too_many_words = (
                len(word_tokenize(line))
                > average_words_per_line * self._HEADER_WORD_RATIO
            )
            if too_many_words and len(line) > average_sentence_length:
                continue

            self.resume_indices.append(i)
            # setdefault tolerates a category that the JSON file defines but
            # the default section list does not.
            self.resume_segments.setdefault(kys[0], []).append(i)

    def slice_segments(self, lines):
        """Turn the recorded header positions into ``(start, end)`` ranges.

        A section starts at its first header and ends at the header that
        follows its last header, or at ``len(lines)`` when its last header is
        the final header found. ``"basics_info"`` covers everything before
        the first header.

        Args:
            lines: the resume lines that were passed to
                ``find_segment_indices``.

        Returns:
            A dict mapping section name to a ``(start, end)`` tuple
            (end exclusive when used as a slice), or ``None`` when no header
            was found.
        """
        if not self.resume_indices:
            return None

        sections = {}
        header_count = len(self.resume_indices)
        for section, points in self.resume_segments.items():
            if not points:
                continue
            start_point = points[0]
            next_position = self.resume_indices.index(points[-1]) + 1
            if next_position < header_count:
                end_point = self.resume_indices[next_position]
            else:
                # The section's last header is the final header overall, so
                # the section runs to the end of the document.
                end_point = len(lines)
            sections[section] = (start_point, end_point)

        sections["basics_info"] = (0, self.resume_indices[0])
        return sections

    def get_interval_intersection(self, sections, interval):
        """Return the first section whose range overlaps ``interval``.

        Args:
            sections: iterable of ``(name, (start, end))`` pairs.
            interval: a ``(start, end)`` pair.

        Returns:
            ``([start, end], section)`` for the first overlapping entry,
            where ``[start, end]`` is the overlap; ``None`` when nothing
            overlaps.
        """
        for section in sections:
            bounds = section[1]
            # Disjoint (or merely touching) ranges: try the next candidate.
            if bounds[0] >= interval[1] or interval[0] >= bounds[1]:
                continue
            start = max(bounds[0], interval[0])
            end = min(bounds[1], interval[1])
            return [start, end], section
        return None

    def segment(self, resume_lines):
        """Find the headers in ``resume_lines`` and compute section ranges.

        Args:
            resume_lines: the resume as a list of strings (header lines may
                be rewritten in place, see ``find_segment_indices``).

        Returns:
            The dict produced by ``slice_segments``, or ``None`` when no
            header was found.
        """
        self.find_segment_indices(resume_lines)
        # TODO: overlapping ranges are not resolved yet; a final cleaning
        # step (zero-shot classifier + interval subtraction) is still needed.
        return self.slice_segments(resume_lines)

    def get_parsed_sections(self, resume_lines):
        """Segment a resume and return the lines belonging to each section.

        Args:
            resume_lines: the resume as a list of strings.

        Returns:
            ``(text_segments, sections)`` where ``sections`` maps section
            name to its ``(start, end)`` range and ``text_segments`` maps
            section name to ``resume_lines[start:end]``; ``(None, None)``
            when no header was found.
        """
        sections = self.segment(resume_lines)
        if sections is None:
            return None, None

        text_segments = {
            header_title: resume_lines[bounds[0]:bounds[1]]
            for header_title, bounds in sections.items()
        }
        return text_segments, sections