khrek committed on
Commit 1ab9160
1 Parent(s): 77cdd98

Upload 2 files

Files changed (2)
  1. resume_parser.py +98 -64
  2. segmenter.py +34 -27
resume_parser.py CHANGED
@@ -1,10 +1,13 @@
 from itertools import chain
 from models import Models
-#from models.prototype.models import Models
 #from output_model import OutputModel, WorkExperience
 from segmenter import ResumeSegmenter
 from flashtext import KeywordProcessor
 from collections import defaultdict
+import re
+import wordninja
+from utils import percentage_difference
+from nltk import word_tokenize
 class ResumeParser():
     def __init__(self) -> None:
         self.resumeSegmenter = ResumeSegmenter()
@@ -28,99 +31,130 @@ class ResumeParser():
         dates_indexes = [self.get_date_index(resume_lines[start_index:end_index], work_date) for work_date in work_dates]
         dates_indexes = list(chain.from_iterable(dates_indexes))
         dates_indexes = [i + start_index for i in dates_indexes]
-        #this list should be unique and ordered
         dates_indexes = sorted([start_index+1] + dates_indexes + [end_index])
         dates_indexes = set(dates_indexes)
-        dates_indexes = list(dates_indexes)
-
-        list_single_work_exp = []
-        for i in range(len(dates_indexes)-1):
-            index = dates_indexes[i]
-            next_index = dates_indexes[i+1]
-            section = resume_lines[index:next_index]
+        dates_indexes = sorted(list(dates_indexes))
+        individual_sections = []
+        for i, index in enumerate(dates_indexes):
+            section = resume_lines[index:dates_indexes[min(i+1, len(dates_indexes)-1)]]
             if len(section) == 0:
-                continue
-            list_single_work_exp.append(section)
-        return list_single_work_exp
+                continue
+            individual_sections.append(section)
+
+        return individual_sections
 
     def extract_section_text(self, resume_lines, section_header = "work_and_employment"):
-        text_segments, sections = self.resumeSegmenter.get_parsed_sections(resume_lines)
+        _ , sections = self.resumeSegmenter.get_parsed_sections(resume_lines)
+        if sections is None:
+            return None
+        print(sections)
+        if section_header not in sections:
+            return None
         start_index = sections[section_header][0]
         end_index = sections[section_header][1]
         #on the basis that dates would be unique
         return start_index, end_index
 
     #more of a utils function
-    def sort_tokens_table(tokens_data):
+    def sort_tokens_table(self, tokens_data):
         table = {}
         for key, tokens in tokens_data:
             for token in tokens:
                 table[token] = key
         return table
 
-    def format_output(self, keywords, work_section_list, isWorkExp=True):
-        if isWorkExp:
-            headlines = [text[0] for text in work_section_list]
-        else:
-            headlines = work_section_list
-        table = self.sort_tokens_table(keywords)
-        tokens_processor = KeywordProcessor()
-        list_keywords = list(chain.from_iterable([tokens[1] for tokens in keywords]))
-        tokens_processor.add_keywords_from_list(list_keywords)
+    def format_output(self, keywords, headlines, isWorkExp=True):
         data = []
-        for i, header in enumerate(headlines):
-            current_data = defaultdict(list)
-            tokens = tokens_processor.extract_keywords(header)
-            for token in tokens:
-                current_data[table[token]].append(token)
-            if isWorkExp:
-                current_data["description"] = work_section_list[i][1:]
-            data.append(dict(current_data))
+        for section in headlines:
+            extracted_data = {}
+
+            paragraph = '\n'.join(section) if isWorkExp else ' '.join(section)
+            extracted_data['description'] = paragraph
+            recovered_headlines = ' '.join(wordninja.split(paragraph))
+            if percentage_difference(len(word_tokenize(paragraph)), len(word_tokenize(recovered_headlines))) > 50:
+                extracted_data['description'] = recovered_headlines
+            for attr in keywords:
+                result = list(set([s for s in attr[1] if s in paragraph or s in recovered_headlines]))
+                if len(result) > 0:
+                    extracted_data[attr[0]] = result
+            data.append(extracted_data)
         return data
+
 
-    def parse_work_history(self, resume_lines):
-        start_index, end_index = self.extract_section_text(resume_lines)
-        work_dates = self.models.get_ner(resume_lines[start_index:end_index], "date")
+    def parse_work_history(self, resume_lines, sections):
+
+        start_index, end_index = sections['work_and_employment']
+        text = ' '.join(resume_lines[start_index:end_index])
+        recovered_text = ' '.join(wordninja.split(text))
+        work_dates, companies, locations = self.models.get_ner(text, recovered_text)
         single_work_experiences = self.split_work_exp(resume_lines, start_index, end_index, work_dates)
-        job_positions = self.models.get_ner(resume_lines[start_index:end_index], "job title")
-        companies = self.models.get_ner(resume_lines[start_index:end_index], "company")
-        keywords = [("date", work_dates), ("title", job_positions), ("company", companies)]
+        entity_dict = self.models.get_custom_ner(' '.join(resume_lines[start_index:end_index]))
+        job_positions = entity_dict['job title']
+        keywords = [("date", work_dates), ("title", job_positions), ("company", companies), ("location", locations)]
         return self.format_output(keywords, single_work_experiences)
 
-    def parse_education(self, resume_lines):
-        start_index, end_index = self.extract_section_text(resume_lines, "education_and_training")
-        tokens = ["degree", "university", "degree field", "date", "location"]
+    def parse_education(self, resume_lines, sections):
+        start_index, end_index = sections["education_and_training"]
+        text = ' '.join(resume_lines[start_index:end_index])
 
-        for token in tokens:
-            keywords = self.get_ner(resume_lines[start_index+1:end_index], token)
-        output = self.format_output(keywords, resume_lines[start_index:end_index], False)
+        dates, universities, locations = self.models.get_ner(text, ' '.join(wordninja.split(text)))
+        single_education_experiences = self.split_work_exp(resume_lines, start_index, end_index, dates)
+        entity_dict = self.models.get_custom_ner(' '.join(resume_lines[start_index+1:end_index]))
+        degrees = entity_dict['degree']
+        majors = entity_dict['major']
+        keywords = [("date", dates), ("major", majors), ("degree", degrees),
+                    ("university", universities), ("location", locations)]
+        output = self.format_output(keywords, single_education_experiences, False)
         output = [res for res in output if res]
 
        return output
 
-    def parse_basic_info(self,resume_lines):
-        start_index, end_index = self.extract_section_text(resume_lines, "basics_info")
-        #tokens = ["person", "email", "phone"]
-        tokens = ["person"]
-        for token in tokens:
-            keywords = self.models.get_ner(resume_lines[start_index:end_index], token)
+    def parse_basic_info(self, resume_lines, sections):
+
+        start_index, end_index = sections["basics_info"]
+        text = ' '.join(resume_lines[start_index:end_index])
+        phone_pattern = r'(?:(?:\+?\d{1,2}[-.\s]?)?(?:\(\d{1,4}\)|\d{1,4})[-.\s]?)?(?:\d{1,5}[-.\s]?){1,4}\d{1,6}'
+        email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
+        entities = self.models.ner(text)
+        if len(entities) == 0:
+            entities = self.models.ner(' '.join(resume_lines))
 
         output = {}
-        for token, result in keywords:
-            if len(result) > 0:
-                output[token] = result[0]
+        score = 0
+        for entity in entities:
+            if entity['entity_group'] == 'PER' and entity['score'] > score and ' ' in entity['word']:
+                output['name'] = entity['word']
+                score = entity['score']
+
+        email = re.findall(email_pattern, text)
+        phone = re.findall(phone_pattern, text)
+        if len(email) == 0:
+            email = re.findall(email_pattern, ' '.join(resume_lines))
+        if len(phone) == 0:
+            phone = re.findall(phone_pattern, ' '.join(resume_lines))
+        output['email'] = email[0] if len(email) > 0 else ''
+        output['phone'] = phone[0] if len(phone) > 0 else ''
         return output
 
     def parse(self, resume_lines):
-        jobs = self.parse_work_history(resume_lines)
-        education = self.parse_education(resume_lines)
-        basic_info = self.parse_basic_info(resume_lines)
-
-        return {"basic_info":basic_info, "education":education, "work_experience":jobs}
+        self.resumeSegmenter.resume_segments = {
+            'objective': [],
+            'work_and_employment': [],
+            'education_and_training': [],
+            'skills': [],
+            'accomplishments': [],
+            'misc': []
+        }
+        self.resumeSegmenter.resume_indices = []
+        sections = self.resumeSegmenter.segment(resume_lines)
+        if sections is None:
+            return {}
+        jobs = self.parse_work_history(resume_lines, sections) if 'work_and_employment' in sections else {}
+        education = self.parse_education(resume_lines, sections) if 'education_and_training' in sections else {}
+        basic_info = self.parse_basic_info(resume_lines, sections) if 'basics_info' in sections else {}
+        result = {"basic_info": basic_info, "education": education, "work_experience": jobs}
+        for section in sections.keys():
+            if section not in ['work_and_employment', 'education_and_training', 'basics_info']:
+                text = '\n'.join(resume_lines[sections[section][0]:sections[section][1]])
+                result[section] = ' '.join(wordninja.split(text))
+        return result
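
Note on the reworked format_output: resume text extracted from PDFs often arrives with words glued together, so the commit re-segments each section with wordninja and keeps the recovered text when the token count shifts sharply. A minimal sketch of that heuristic follows; percentage_difference lives in the repo's utils module, which is not part of this diff, so the definition below is an assumption, and the sample string is hypothetical.

# Sketch of the word-recovery heuristic used in format_output.
import nltk
import wordninja
from nltk import word_tokenize

nltk.download('punkt')

def percentage_difference(a, b):
    # assumed helper (real one is in utils): relative difference in percent
    return abs(a - b) / max(a, b) * 100

paragraph = "SoftwareEngineeratAcmeCorpbuiltresumeparsingpipelines"  # glued PDF text
recovered = ' '.join(wordninja.split(paragraph))
# keep the re-segmented text only when it changes the token count by more than 50%
if percentage_difference(len(word_tokenize(paragraph)), len(word_tokenize(recovered))) > 50:
    paragraph = recovered
print(paragraph)  # "Software Engineer at Acme Corp built resume parsing pipelines"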
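The parse_basic_info rewrite pulls email and phone with plain regexes before falling back to the whole document. The patterns below are copied verbatim from the diff; the snippet just exercises them standalone with re.findall on a made-up contact line.

# The email/phone patterns from parse_basic_info, exercised standalone.
import re

phone_pattern = r'(?:(?:\+?\d{1,2}[-.\s]?)?(?:\(\d{1,4}\)|\d{1,4})[-.\s]?)?(?:\d{1,5}[-.\s]?){1,4}\d{1,6}'
email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'

text = "Jane Doe | jane.doe@example.com | +1 (555) 010-0100"
email = re.findall(email_pattern, text)  # ['jane.doe@example.com']
phone = re.findall(phone_pattern, text)  # digit runs; the first hit is taken as the phone
print(email[0] if email else '', phone[0] if phone else '')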
segmenter.py CHANGED
@@ -1,8 +1,14 @@
 from flashtext import KeywordProcessor
 import json
+import nltk
+from nltk.tokenize import word_tokenize, LineTokenizer
+from utils import get_average_words_per_line, get_average_line_len
+import wordninja
+nltk.download('punkt')
 class ResumeSegmenter():
 
     def __init__(self):
+        #has to be reinitialized for each resume !!! could just check the initialization in get_parsed_sections
         self.resume_segments = {
             'objective': [],
             'work_and_employment': [],
@@ -12,41 +18,37 @@ class ResumeSegmenter():
             'misc': []
         }
         self.resume_indices = []
+        with open(r"models/prototype/sections.json") as f:
+            data = json.load(f)
+            self.section_headers = data["section_headers"]
+            f.close()
+        self.keyword_processor = KeywordProcessor()
+        self.keyword_processor.add_keywords_from_dict(keyword_dict=self.section_headers)
 
-    def get_average_line_len(self, lines):
-        sum = 0
-        for line in lines:
-            sum += len(line)
-        return sum / len(lines)
 
-    def get_average_words_per_line(self, lines):
-        sum = 0
-        for line in lines:
-            #other stopwords too?
-            sum += len(line.split(' '))
-        return sum / len(lines)
 
     def find_segment_indices(self, text_list):
-        with open(r"./sections.json") as f:
-            data = json.load(f)
-            section_headers = data["section_headers"]
-            f.close()
-        keyword_processor = KeywordProcessor()
-        keyword_processor.add_keywords_from_dict(keyword_dict=section_headers)
-        average_words_per_line = self.get_average_words_per_line(text_list)
-
+
+        average_words_per_line = get_average_words_per_line(text_list)
+        average_sentence_length = get_average_line_len(text_list)
+
         for i, line in enumerate(text_list):
+            line_tokenized = LineTokenizer(blanklines='discard').tokenize(line)
             if line[0].islower() or line[-1] == '.':
                 continue
-            kys = keyword_processor.extract_keywords(line)
+            kys = self.keyword_processor.extract_keywords(line)
+            if self.keyword_processor.extract_keywords(' '.join(word_tokenize(line))) != []:
+                text_list[i] = line = ' '.join(word_tokenize(line))
+                kys = self.keyword_processor.extract_keywords(line)
+
             if len(kys) > 0:
-                #other stopwords? from where? nltk lib ? pos tagger?
-                if len(line.split(" ")) > average_words_per_line * 0.75:
+                if len(word_tokenize(line)) > average_words_per_line * 0.75 and len(line) > average_sentence_length:
                     continue
-                #is it necessary to keep the actual raw keyword?
+
                 self.resume_indices.append(i)
                 self.resume_segments[kys[0]].append(i)
-
+
+
     def slice_segments(self, lines):
         sections = {}
         if len(self.resume_indices) == 0:
@@ -73,12 +75,14 @@ class ResumeSegmenter():
             start = max(s[0], interval[0])
             end = min(s[1], interval[1])
             return [start, end], section
+
     def segment(self, resume_lines):
        self.find_segment_indices(resume_lines)
        sections = self.slice_segments(resume_lines)
-        #whats the naming convention here sections_list or list_sections???
+        if sections is None:
+            return None
         sections_list = [(k, v) for k,v in sections.items() if len(v) > 0 ]
-        intersection_intervals = []
+        """intersection_intervals = []
 
         for i, s in enumerate(sections_list[:-1]):
             result = self.get_interval_intersection(sections_list[i+1:], s[1])
@@ -90,14 +94,17 @@ class ResumeSegmenter():
                 intersection_intervals.append((a,b,s[0]))
 
         if len(intersection_intervals) > 0:
-            print("there are intersections", intersection_intervals)
+            print("there are intersections", intersection_intervals)"""
         #needs last method of cleaning overlapping intervals with zero shot
         #classifier + subtract intervals
         return sections
 
     def get_parsed_sections(self, resume_lines):
+
         text_segments = {}
         sections = self.segment(resume_lines)
+        if sections is None:
+            return None, None
         for header_title, section in sections.items():
             lines = resume_lines[section[0]:section[1]]
             text_segments[header_title] = lines
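
Taken together, parse() now resets the segmenter's per-resume state, returns {} when segmentation finds nothing, and only runs the section parsers whose headers were detected, stuffing any remaining sections into the result as recovered plain text. A hypothetical end-to-end call, assuming the repo's models and utils dependencies are importable and that ResumeParser is imported from resume_parser.py; the sample resume lines are made up.

# Hypothetical usage; resume_lines is one string per visual line of the resume.
from resume_parser import ResumeParser

parser = ResumeParser()
resume_lines = [
    "Jane Doe",
    "jane.doe@example.com +1 (555) 010-0100",
    "Work Experience",
    "Software Engineer, Acme Corp",
    "Jan 2019 - Dec 2021",
    "Education",
    "B.Sc. Computer Science, State University, 2018",
]
result = parser.parse(resume_lines)
# {} if no sections were detected; otherwise keys like "basic_info",
# "education", "work_experience", plus any extra detected sections
print(result.get("work_experience"))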