"""Gradio app that extracts key fields from an uploaded PDF or DOCX resume."""

import logging
import re
import string

import docx
import gradio as gr
import nltk
import pdfplumber
import spacy
from nltk.corpus import words
from spacy.cli import download

logger = logging.getLogger(__name__)

SPACY_MODEL = "en_core_web_sm"

# Load the spaCy model, downloading it only if it is not installed yet.
try:
    nlp = spacy.load(SPACY_MODEL)
except OSError:
    download(SPACY_MODEL)
    nlp = spacy.load(SPACY_MODEL)

# Set of English words, lower-cased so lookups can be case-insensitive.
try:
    nltk.data.find("corpora/words")
except LookupError:
    nltk.download("words", quiet=True)
english_words = {word.lower() for word in words.words()}

# Number of output components declared on the Gradio interface below;
# parse_resume must always return exactly this many values.
_NUM_OUTPUTS = 7

# An ORG entity is treated as a company only if it carries one of these words.
_COMPANY_SUFFIX_RE = re.compile(
    r'\b(?:Inc|Corp|LLC|Ltd|Co|Company|Group|Services|Technologies|Pvt'
    r'|Solutions|Consulting)\b',
    re.IGNORECASE,
)

# An ORG entity is treated as a college if it contains one of these substrings.
_EDU_KEYWORDS = (
    "university", "college", "institute", "school", "academy",
    "polytechnic", "faculty", "department", "center", "centre",
    "campus", "educational",
)

# Durations such as "5 years", "5+ years", "2.5 yrs", "6 months".
_YEARS_RE = re.compile(r'(\d+(?:\.\d+)?)\s*\+?\s*(?:years?|yrs?)\b', re.IGNORECASE)
_MONTHS_RE = re.compile(r'(\d+)\s*\+?\s*months?\b', re.IGNORECASE)

# Look-arounds are used instead of \b because \b cannot match in front of
# "(" or "+", which made "(555) 123-4567" and "+1 ..." fail or lose the plus.
_PHONE_RE = re.compile(
    r'(?<!\w)(?:(?:\+\d{1,3}|1)[-.\s]?)?(?:\(\d{3}\)|\d{3})'
    r'[-.\s]?\d{3}[-.\s]?\d{4}(?!\w)'
)

_EMAIL_RE = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')

# Scheme and subdomain are optional so "linkedin.com/in/jane-doe" also matches.
_LINKEDIN_RE = re.compile(
    r'(?<![\w-])(?:(?:https?:)?//)?(?:[\w-]+\.)?linkedin\.com/in/'
    r'[A-Za-z0-9_%-]+/?',
    re.IGNORECASE,
)

# A sentence qualifies for the summary when it has more than this many words
# and more than this share of them are known English words.
_SUMMARY_MIN_WORDS = 5
_SUMMARY_MIN_ENGLISH_RATIO = 0.7
_SUMMARY_MAX_SENTENCES = 3


class _UnsupportedFormatError(ValueError):
    """Raised when an upload is neither a PDF nor a DOCX file."""


def _read_resume_text(file):
    """Return the text of an uploaded resume, raising on failure.

    Args:
        file: A filesystem path, or an object whose ``name`` attribute holds
            the file name (either form is passed on to the format reader).

    Raises:
        ValueError: If ``file`` is None.
        _UnsupportedFormatError: If the name ends in neither .pdf nor .docx.
    """
    if file is None:
        raise ValueError("no file provided")
    # The extension check is case-insensitive so ".PDF" / ".DOCX" are accepted.
    name = str(getattr(file, "name", file)).lower()
    if name.endswith('.pdf'):
        return extract_text_from_pdf(file)
    if name.endswith('.docx'):
        return extract_text_from_docx(file)
    raise _UnsupportedFormatError("Unsupported file format")


def extract_text(file):
    """Return the text of a PDF/DOCX upload, or an error message string.

    Never raises: returns "Unsupported file format" for other extensions and
    "Error extracting text: ..." when reading fails.
    """
    try:
        return _read_resume_text(file)
    except _UnsupportedFormatError as e:
        return str(e)
    except Exception as e:
        return f"Error extracting text: {e}"


def extract_text_from_pdf(file):
    """Return the text of every page of a PDF, one page per line group."""
    with pdfplumber.open(file) as pdf:
        # Pages are joined with a newline so the last word of one page does
        # not fuse with the first word of the next.
        return "\n".join(page.extract_text() or '' for page in pdf.pages)


def extract_text_from_docx(file):
    """Return the paragraph text of a DOCX document, one paragraph per line."""
    doc = docx.Document(file)
    return "\n".join(para.text for para in doc.paragraphs)


def _one_per_line(names):
    """Join names one per line, collapsing whitespace and dropping repeats."""
    cleaned = (" ".join(name.split()) for name in names)
    # dict.fromkeys de-duplicates while keeping first-seen order.
    return "\n".join(dict.fromkeys(cleaned))


def _companies_from_doc(doc):
    """Return company-like ORG entities of a parsed document, one per line."""
    return _one_per_line(
        ent.text
        for ent in doc.ents
        if ent.label_ == "ORG" and _COMPANY_SUFFIX_RE.search(ent.text)
    )


def extract_companies(text):
    """Return ORG entities in ``text`` that contain a company word, one per line."""
    return _companies_from_doc(nlp(text))


def _colleges_from_doc(doc):
    """Return education-like ORG entities of a parsed document, one per line."""
    return _one_per_line(
        ent.text
        for sent in doc.sents
        for ent in sent.ents
        if ent.label_ == "ORG"
        and any(keyword in ent.text.lower() for keyword in _EDU_KEYWORDS)
    )


def extract_colleges(text):
    """Return ORG entities in ``text`` that contain an education keyword, one per line."""
    return _colleges_from_doc(nlp(text))


def extract_years_of_experience(text):
    """Sum every "<n> years" / "<n> months" mention found in ``text``.

    This is a heuristic: all matching durations are added together. Accepts
    forms such as "5 years", "5+ years", "2.5 yrs" and "6 months".

    Returns:
        "<Y> years and <M> months", or "Not available" when the total is zero.
    """
    year_total = sum(float(value) for value in _YEARS_RE.findall(text))
    month_total = sum(int(value) for value in _MONTHS_RE.findall(text))
    total_months = round(year_total * 12) + month_total
    if not total_months:
        return "Not available"
    whole_years, remaining_months = divmod(total_months, 12)
    return f"{whole_years} years and {remaining_months} months"


def extract_phone(text):
    """Return the first phone-number-like match in ``text``, or "Not found"."""
    match = _PHONE_RE.search(text)
    return match.group() if match else "Not found"


def extract_email(text):
    """Return the first e-mail address in ``text``, or "Not found"."""
    match = _EMAIL_RE.search(text)
    return match.group() if match else "Not found"


def _is_mostly_english(tokens):
    """Return True if enough whitespace-separated tokens are known words."""
    if len(tokens) <= _SUMMARY_MIN_WORDS:
        return False
    # Surrounding punctuation is stripped so "experience," still counts.
    known = sum(
        1 for token in tokens
        if token.strip(string.punctuation).lower() in english_words
    )
    return known / len(tokens) > _SUMMARY_MIN_ENGLISH_RATIO


def extract_summary(doc):
    """Return up to three mostly-English sentences from a parsed document.

    Sentences are taken in document order and joined with single spaces;
    whitespace inside each sentence is collapsed.
    """
    summary = []
    for sent in doc.sents:
        if len(summary) >= _SUMMARY_MAX_SENTENCES:
            break
        tokens = sent.text.split()
        if _is_mostly_english(tokens):
            summary.append(" ".join(tokens))
    return " ".join(summary)


def extract_linkedin(text):
    """Return the first LinkedIn profile URL in ``text``, or "Not found"."""
    match = _LINKEDIN_RE.search(text)
    return match.group() if match else "Not found"


def _error_outputs(message):
    """Return a full set of interface outputs carrying only an error message."""
    return (message,) + ("",) * (_NUM_OUTPUTS - 1)


def parse_resume(file):
    """Extract resume fields for the Gradio interface.

    Args:
        file: The uploaded resume (path or object with a ``name`` attribute).

    Returns:
        A 7-tuple (companies, colleges, years_of_experience, phone, email,
        summary, linkedin). On failure the first element holds the error
        message and the rest are empty strings, so the number of values
        always matches the interface's output components.
    """
    try:
        text = _read_resume_text(file)
    except _UnsupportedFormatError as e:
        return _error_outputs(str(e))
    except Exception as e:
        logger.exception("Text extraction failed")
        return _error_outputs(f"Error extracting text: {e}")

    if not text.strip():
        return _error_outputs("No text could be extracted from the file")

    try:
        # Parse once and share the result; the spaCy pipeline is the
        # expensive step.
        doc = nlp(text)
        return (
            _companies_from_doc(doc),
            _colleges_from_doc(doc),
            extract_years_of_experience(text),
            extract_phone(text),
            extract_email(text),
            extract_summary(doc),
            extract_linkedin(text),
        )
    except Exception as e:
        # The traceback goes to the log rather than to the (possibly public)
        # web page.
        logger.exception("Resume parsing failed")
        return _error_outputs(f"An error occurred while parsing the resume: {e}")


# Create Gradio interface with separate output components
iface = gr.Interface(
    fn=parse_resume,
    inputs=gr.File(label="Upload Resume (PDF or DOCX)"),
    outputs=[
        gr.Textbox(label="Companies Worked For", lines=10),
        gr.Textbox(label="Colleges Attended", lines=10),
        gr.Textbox(label="Years of Experience"),
        gr.Textbox(label="Phone Number"),
        gr.Textbox(label="Email ID"),
        gr.Textbox(label="Summary", lines=3),
        gr.Textbox(label="LinkedIn ID"),
    ],
    title="Advanced Resume Parser",
    description="Upload a resume in PDF or DOCX format to extract key information.",
)

if __name__ == "__main__":
    iface.launch(share=True)