import re

import pdfplumber
import pypdfium2 as pdfium
from PIL import Image
from pytesseract import image_to_string

from utils import recover_text, get_average_line_len

# Compiled once at import time; used on every line of every document.
_WHITESPACE_RUN = re.compile(r'\s+')
_NEWLINE_RUN = re.compile(r'\n+')
_NON_ASCII_RUN = re.compile(r'[^\x00-\x7F]+')
_CID_MARKER = re.compile(r"\(cid:\d{0,3}\)")

_PDF_SUFFIXES = ('.pdf',)
_IMAGE_SUFFIXES = ('.jpg', '.jpeg', '.png')


class ResumeReader:
    """Extract a resume's text as a list of cleaned, non-empty lines.

    PDFs are read with pdfplumber; JPEG/PNG images are run through
    pytesseract OCR. Both paths share the same cleaning and line-splitting
    steps.
    """

    def clean_text(self, raw_text):
        """Return *raw_text* normalised to plain ASCII.

        Carriage returns become newlines, runs of newlines are collapsed to
        one, tabs and the U+F0B7 bullet glyph become spaces, every other
        non-ASCII character is dropped, and ``(cid:N)`` markers are replaced
        by a space.
        """
        # Normalise CR first so that CRLF / CR-only line endings are also
        # covered by the newline collapse below.
        text = raw_text.replace("\r", "\n")
        text = _NEWLINE_RUN.sub('\n', text)
        text = text.replace("\t", " ")
        # U+F0B7 is turned into a space *before* the non-ASCII strip so the
        # words on either side of it stay separated.
        text = text.replace("\uf0b7", " ")
        # Everything outside ASCII is removed. This also removes U+2022
        # bullets, so no separate bullet substitution is needed afterwards.
        text = _NON_ASCII_RUN.sub('', text)
        text = _CID_MARKER.sub(" ", text)
        return text

    def _to_lines(self, text):
        """Split cleaned *text* into stripped, whitespace-collapsed lines.

        Lines that are empty after stripping are dropped.
        """
        stripped_lines = (line.strip() for line in text.splitlines())
        # NOTE: the original code carried a disabled post-processing step
        # based on utils.get_average_line_len / utils.recover_text; it is
        # still not applied here.
        return [_WHITESPACE_RUN.sub(' ', line) for line in stripped_lines if line]

    def read_image(self, path_file):
        """OCR the image at *path_file* and return its cleaned lines."""
        # The context manager closes the underlying file handle once OCR
        # has finished.
        with Image.open(path_file) as image:
            raw_text = str(image_to_string(image))
        return self._to_lines(self.clean_text(raw_text))

    def read_pdf(self, path_file):
        """Extract the text of every page of the PDF at *path_file*.

        Returns the cleaned lines of all pages, in page order.
        """
        with pdfplumber.open(path_file) as pdf:
            # `or ""` guards against extract_text() yielding None for a page
            # without a text layer.
            page_texts = [page.extract_text() or "" for page in pdf.pages]
        # Join with a newline so the last line of one page is not fused with
        # the first line of the next.
        raw_text = "\n".join(page_texts)
        return self._to_lines(self.clean_text(raw_text))

    def read(self, path_file):
        """Read a resume, dispatching on the file extension.

        The extension check is case-insensitive. Supported extensions are
        ``.pdf``, ``.jpg``, ``.jpeg`` and ``.png``.

        Returns:
            A list of cleaned lines, or ``None`` (after printing a message)
            when the extension is not supported.
        """
        # str() lets path-like objects through; lower() makes ".PDF" and
        # ".JPG" match as well.
        lowered = str(path_file).lower()
        if lowered.endswith(_PDF_SUFFIXES):
            return self.read_pdf(path_file)
        if lowered.endswith(_IMAGE_SUFFIXES):
            return self.read_image(path_file)
        print("Unsupported file format")
        return None