import json

import nltk
import wordninja
from flashtext import KeywordProcessor
from nltk.tokenize import word_tokenize

from utils import get_average_words_per_line, get_average_line_len

nltk.download('punkt')
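
# The two helpers imported from utils are assumed to return simple corpus
# statistics: get_average_words_per_line(lines) as the mean token count per
# line and get_average_line_len(lines) as the mean character length per line.
# Both are used below to tell dense body text apart from short header lines.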

class ResumeSegmenter:
    def __init__(self):
        # Has to be reinitialized for each resume! Alternatively, the
        # initialization state could be checked in get_parsed_sections.
        self.resume_segments = {
            'objective': [],
            'work_and_employment': [],
            'education_and_training': [],
            'skills': [],
            'accomplishments': [],
            'misc': []
        }
        self.resume_indices = []
        # Section-header synonyms are loaded once and fed to flashtext for
        # fast keyword matching; the with-block already closes the file.
        with open("./sections.json") as f:
            data = json.load(f)
        self.section_headers = data["section_headers"]
        self.keyword_processor = KeywordProcessor()
        self.keyword_processor.add_keywords_from_dict(keyword_dict=self.section_headers)
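
    # Illustrative shape of sections.json (hypothetical values): the keys under
    # "section_headers" must match the resume_segments buckets above, and each
    # maps to the header spellings flashtext should normalize to that key, e.g.
    # {
    #     "section_headers": {
    #         "education_and_training": ["education", "academic background"],
    #         "skills": ["skills", "technical skills"]
    #     }
    # }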

    def find_segment_indices(self, text_list):
        average_words_per_line = get_average_words_per_line(text_list)
        average_sentence_length = get_average_line_len(text_list)
        for i, line in enumerate(text_list):
            if not line.strip():
                continue
            # Section headers rarely start lowercase or end with a period.
            if line[0].islower() or line[-1] == '.':
                continue
            kys = self.keyword_processor.extract_keywords(line)
            # Retokenizing separates glued punctuation, which can expose a
            # header keyword that the raw line hides.
            if self.keyword_processor.extract_keywords(' '.join(word_tokenize(line))):
                text_list[i] = line = ' '.join(word_tokenize(line))
                kys = self.keyword_processor.extract_keywords(line)
            if len(kys) > 0:
                # Lines much longer than average are likely body text that
                # merely mentions a header keyword, not a header itself.
                if len(word_tokenize(line)) > average_words_per_line * 0.75 and len(line) > average_sentence_length:
                    continue
                self.resume_indices.append(i)
                self.resume_segments[kys[0]].append(i)

    def slice_segments(self, lines):
        sections = {}
        if len(self.resume_indices) == 0:
            return None
        for section, points in self.resume_segments.items():
            if len(points) == 0:
                continue
            start_point = points[0]
            # A section ends at the header that follows its last matched
            # header line, or at the end of the resume if none follows.
            tmp_end_point = points[-1]
            end_point = self.resume_indices[min(self.resume_indices.index(tmp_end_point) + 1,
                                                len(self.resume_indices) - 1)]
            if start_point == self.resume_indices[-1]:
                end_point = len(lines)
            sections[section] = (start_point, end_point)
        # Everything before the first detected header is contact/basic info.
        sections["basics_info"] = (0, self.resume_indices[0])
        return sections
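
    # Example return value of slice_segments (hypothetical line numbers):
    # {"work_and_employment": (4, 12), "skills": (12, 18), "basics_info": (0, 4)}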

    def get_interval_intersection(self, sections, interval):
        # Return the overlap between `interval` and the first section whose
        # span intersects it, or None if nothing overlaps.
        for section in sections:
            s = section[1]
            if s[0] >= interval[1] or interval[0] >= s[1]:
                continue
            start = max(s[0], interval[0])
            end = min(s[1], interval[1])
            return [start, end], section
        return None
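
    # Example (hypothetical spans):
    # get_interval_intersection([("skills", (10, 20))], (15, 25))
    # -> ([15, 20], ("skills", (10, 20)))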

    def segment(self, resume_lines):
        self.find_segment_indices(resume_lines)
        sections = self.slice_segments(resume_lines)
        if sections is None:
            if len(self.resume_indices) == 0:
                return None
            # Fall back to the raw header indices when slicing produced nothing.
            sections = {}
            for key, value in self.resume_segments.items():
                if len(value) > 0:
                    sections[key] = [min(value), max(value)]
            sections["basics_info"] = (0, self.resume_indices[0])
        sections_list = [(k, v) for k, v in sections.items() if len(v) > 0]
        # Overlap detection between sections, kept for a future cleaning pass:
        # intersection_intervals = []
        # for i, s in enumerate(sections_list[:-1]):
        #     result = self.get_interval_intersection(sections_list[i + 1:], s[1])
        #     if result is None:
        #         continue
        #     a, b = result
        #     intersection_intervals.append((a, b, s[0]))
        # if len(intersection_intervals) > 0:
        #     print("there are intersections", intersection_intervals)
        # TODO: clean overlapping intervals with a zero-shot classifier and
        # subtract the intersections.
        return sections

    def get_parsed_sections(self, resume_lines):
        # Map each detected section title to its slice of resume lines.
        text_segments = {}
        sections = self.segment(resume_lines)
        if sections is None:
            return None, None
        for header_title, section in sections.items():
            lines = resume_lines[section[0]:section[1]]
            text_segments[header_title] = lines
        return text_segments, sections
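

# Minimal usage sketch, assuming sections.json sits next to this file and that
# utils provides the two averaging helpers imported above; the sample lines
# and their headers are hypothetical and must match entries in sections.json
# for segmentation to trigger.
if __name__ == "__main__":
    sample_lines = [
        "John Doe",
        "john@example.com",
        "EDUCATION",
        "B.Sc. Computer Science, Example University, 2020",
        "SKILLS",
        "Python, NLP, machine learning",
    ]
    segmenter = ResumeSegmenter()  # reinitialize for each resume (see __init__)
    text_segments, spans = segmenter.get_parsed_sections(sample_lines)
    if text_segments is not None:
        for title, seg_lines in text_segments.items():
            print(title, seg_lines)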