# Spaces: billusanda007/HireGPT (status: Sleeping)
# File size: 6,937 bytes

import streamlit as st
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from PyPDF2 import PdfReader
import os
from io import BytesIO
import pickle
import pdfminer
from pdfminer.high_level import extract_text
import re
import PyPDF2
import textract
import tempfile
from docx import Document

# Fetch the NLTK data used by preprocess_text: the 'punkt' tokenizer data and
# the 'stopwords' corpus. This runs every time the module is loaded.
for _nltk_resource in ('punkt', 'stopwords'):
    nltk.download(_nltk_resource)

def preprocess_text(text):
    """Normalize free text into a space-separated string of word stems.

    The text is lowercased and tokenized, English stop words are dropped,
    and every remaining token is reduced to its Porter stem.

    Args:
        text: The raw text to normalize.

    Returns:
        The stemmed tokens joined by single spaces.
    """
    tokens = word_tokenize(text.lower())
    english_stop_words = set(stopwords.words('english'))
    porter = PorterStemmer()

    return ' '.join(
        porter.stem(token)
        for token in tokens
        if token not in english_stop_words
    )

def extract_text_from_pdf(pdf_content):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_content: The raw bytes of the PDF file.

    Returns:
        The extracted page texts joined with no separator.
    """
    reader = PdfReader(BytesIO(pdf_content))
    return ''.join(page.extract_text() for page in reader.pages)

def extract_text_from_docx(docx_content):
    """Return the paragraph text of a DOCX document as one string.

    Args:
        docx_content: The raw bytes of the .docx file.

    Returns:
        The text of each paragraph, joined by single spaces.
    """
    document = Document(BytesIO(docx_content))
    paragraph_texts = [paragraph.text for paragraph in document.paragraphs]
    return " ".join(paragraph_texts)


def extract_text_from_txt(txt_content):
    """Decode the raw content of a plain-text resume into a string.

    Plain text needs no external parser, so the bytes are decoded directly
    as UTF-8, the same way rank_and_shortlist handles ``.txt`` uploads.
    Undecodable bytes are dropped rather than raising, so one stray byte
    does not make a whole resume unreadable.

    Args:
        txt_content: The file content as bytes (or an already-decoded str,
            which is returned unchanged).

    Returns:
        The content as a ``str``, like the PDF and DOCX extractors.
    """
    if isinstance(txt_content, str):
        return txt_content
    return bytes(txt_content).decode('utf-8', errors='ignore')

def extract_text_from_resume(file_path):
    """Extract the text of a resume stored on disk, based on its extension.

    The per-format extractors in this module take the raw file *content*
    (bytes), not a path, so the file is read here and its bytes are handed
    to the matching extractor.

    Args:
        file_path: Path to a ``.pdf``, ``.docx`` or ``.txt`` resume.

    Returns:
        The text extracted from the file.

    Raises:
        ValueError: If the extension is not pdf, docx or txt. This is
            checked before the file is opened.
        OSError: If the file cannot be opened or read.
    """
    # splitext looks only at the final path component, so a dotted directory
    # name (e.g. "v1.2/resume") is not mistaken for an extension.
    file_extension = os.path.splitext(file_path)[1].lower().lstrip('.')

    if file_extension not in ('pdf', 'docx', 'txt'):
        raise ValueError(f"Unsupported file format: {file_extension}")

    with open(file_path, 'rb') as resume_file:
        content = resume_file.read()

    if file_extension == 'pdf':
        return extract_text_from_pdf(content)
    if file_extension == 'docx':
        return extract_text_from_docx(content)
    return extract_text_from_txt(content)

def clean_pdf_text(text):
    """Strip URLs, markers, punctuation and non-ASCII noise from resume text.

    The substitutions run in order:
      1. URLs (anything starting with "http") become a space.
      2. Standalone "RT" and "cc" tokens become a space. Word boundaries
         keep ordinary words such as "success" or "account" intact.
      3. Hashtags are removed.
      4. "@" and the non-space characters following it become two spaces
         (this also removes the domain part of e-mail addresses).
      5. ASCII punctuation becomes a space.
      6. Non-ASCII characters become a space.
      7. Runs of whitespace collapse to a single space.

    Args:
        text: The raw extracted text.

    Returns:
        The cleaned text. Leading/trailing single spaces are not trimmed.
    """
    # Raw strings throughout: sequences like "\S" in a normal string literal
    # are invalid escapes and trigger warnings on recent Python versions.
    text = re.sub(r'http\S+\s*', ' ', text)
    text = re.sub(r'\b(?:RT|cc)\b', ' ', text)
    text = re.sub(r'#\S+', '', text)
    text = re.sub(r'@\S+', '  ', text)
    text = re.sub('[%s]' % re.escape(r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', text)
    text = re.sub(r'[^\x00-\x7f]', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text

def extract_candidate_name(text):
    """Return the first "Firstname Lastname"-shaped match found in text.

    A match is two consecutive capitalised words (one capital followed by
    lowercase letters each), optionally preceded by "Mr.", "Ms." or "Mrs.".

    Args:
        text: The text to search.

    Returns:
        The matched name, including the title when present, with
        surrounding whitespace removed; or the string
        "Candidate Name Not Found" when nothing matches.
    """
    pattern = r'(?:Mr\.|Ms\.|Mrs\.)?\s?([A-Z][a-z]+)\s([A-Z][a-z]+)'
    match = re.search(pattern, text)
    if match is None:
        return "Candidate Name Not Found"
    # The optional whitespace in the pattern can consume the space *before*
    # the name, so trim the match before returning it.
    return match.group(0).strip()

def calculate_similarity(job_description, cvs, cv_file_names):
    """Rank CVs by TF-IDF cosine similarity to a job description.

    The job description and every CV are normalized with preprocess_text,
    vectorized together with a single TfidfVectorizer, and each CV is
    scored against the job description.

    Args:
        job_description: The job description text.
        cvs: CV texts, in the same order as cv_file_names.
        cv_file_names: The name to report for each CV.

    Returns:
        A list of ``(file_name, score)`` tuples sorted by score, highest
        first. Empty when no CVs are given.
    """
    if not cvs:
        return []

    processed_job_desc = preprocess_text(job_description)
    processed_cvs = [preprocess_text(cv) for cv in cvs]

    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([processed_job_desc] + processed_cvs)

    # Row 0 is the job description and rows 1.. are the CVs. Only the job
    # description's similarity to each CV is needed, so compare that one row
    # against the CV rows instead of building the full pairwise matrix.
    similarity_scores = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:])[0]

    return sorted(
        zip(cv_file_names, similarity_scores),
        key=lambda pair: pair[1],
        reverse=True,
    )

def extract_email_phone(text):
    """Find e-mail addresses and phone numbers in text.

    Phone numbers are recognised as 10 digits grouped 3-3-4 or 7 digits
    grouped 3-4, with an optional "-", "." or whitespace between groups.

    Args:
        text: The text to scan.

    Returns:
        A tuple ``(emails, phones)`` of lists of matched strings, in order
        of appearance.
    """
    # The top-level-domain class is [A-Za-z]: a "|" inside a character class
    # is a literal pipe, not alternation, and would let "a@b.com|" match.
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    phone_pattern = r'\b(?:\d{3}[-.\s]??\d{3}[-.\s]??\d{4}|\d{3}[-.\s]??\d{4})\b'

    emails = re.findall(email_pattern, text)
    phones = re.findall(phone_pattern, text)

    return emails, phones



def rank_and_shortlist(job_description, cv_files, threshold=0.10):
    """Extract, score and shortlist uploaded resumes against a job description.

    Each file is read according to its extension (.pdf, .docx or .txt).
    Files with another extension, or files that raise during processing,
    are reported with ``st.warning`` and skipped.

    Args:
        job_description: The text the resumes are compared to.
        cv_files: Uploaded file objects exposing ``.name`` and ``.read()``.
        threshold: Minimum similarity score for a resume to be shortlisted.

    Returns:
        A 3-tuple ``(ranked_cvs, shortlisted_cvs, contact_info_dict)``:
        ``ranked_cvs`` is every processed resume as ``(file_name, score)``,
        best first; ``shortlisted_cvs`` is the subset scoring at least
        ``threshold``; ``contact_info_dict`` maps each file name to
        ``{'emails': [...], 'phones': [...]}``. When no resume could be
        processed an error is shown and ``([], [], {})`` is returned.
    """
    cv_texts = []
    cv_file_names = []
    contact_info_dict = {}

    for cv_file in cv_files:
        file_extension = os.path.splitext(cv_file.name)[1].lower()

        try:
            if file_extension == '.pdf':
                cv_text = extract_text_from_pdf(cv_file.read())
            elif file_extension == '.docx':
                cv_text = extract_text_from_docx(cv_file.read())
            elif file_extension == '.txt':
                cv_text = cv_file.read().decode('utf-8', errors='ignore')
            else:
                st.warning(f"Unsupported file format: {file_extension}. Skipping file: {cv_file.name}")
                continue

            cleaned_text = clean_pdf_text(cv_text)
            # Contact details come from the raw text: cleaning strips "@"
            # and punctuation, which would destroy e-mails and phone numbers.
            emails, phones = extract_email_phone(cv_text)

        except Exception as e:
            st.warning(f"Error processing file '{cv_file.name}': {str(e)}")
            continue

        # Record results only after every step succeeded, so the text, name
        # and contact entries can never get out of step with one another.
        cv_texts.append(cleaned_text)
        cv_file_names.append(cv_file.name)
        contact_info_dict[cv_file.name] = {
            'emails': emails,
            'phones': phones,
        }

    if not cv_texts:
        st.error("No valid resumes found. Please upload resumes in supported formats (PDF, DOCX, or TXT).")
        # Same 3-tuple shape as the normal return, so callers can unpack it.
        return [], [], {}

    ranked_cvs = calculate_similarity(job_description, cv_texts, cv_file_names)
    shortlisted_cvs = [(cv_name, score) for (cv_name, score) in ranked_cvs if score >= threshold]

    return ranked_cvs, shortlisted_cvs, contact_info_dict


def main():
    """Render the Streamlit page for the resume ranking app.

    Collects a job title, a job description and uploaded resumes. When
    "Submit" is pressed with all three provided, the resumes are ranked via
    rank_and_shortlist, and the full ranking is shown followed by the
    shortlisted candidates with any e-mails and phone numbers found.
    """
    proceed_hint = "Please enter the job title, job description, and upload resumes to proceed."

    st.title("Resume Ranking App")

    st.write("Enter Job Title:")
    job_title = st.text_input("Job Title")

    st.write("Enter Job Description:")
    job_description = st.text_area("Job Description", height=200, key='job_description')

    st.write("Upload the Resumes:")
    cv_files = st.file_uploader("Choose files", accept_multiple_files=True, key='cv_files')

    # Nothing submitted yet: show the hint as plain text.
    if not st.button("Submit"):
        st.write(proceed_hint)
        return

    # Submitted with something missing: show the same hint as an error.
    if not (job_title and job_description and cv_files):
        st.error(proceed_hint)
        return

    # The title is prepended so it contributes to the similarity score too.
    ranked_cvs, shortlisted_cvs, contact_info_dict = rank_and_shortlist(
        f"{job_title} {job_description}", cv_files
    )

    st.markdown("### Ranking of Resumes:")
    for file_name, similarity in ranked_cvs:
        st.markdown(f"**File Name:** {file_name}, **Similarity Score:** {similarity:.2f}")

    st.markdown("### Shortlisted Candidates:")
    if not shortlisted_cvs:
        st.markdown("None")
        return

    for file_name, similarity in shortlisted_cvs:
        st.markdown(f"**File Name:** {file_name}, **Similarity Score:** {similarity:.2f}")

        contact_info = contact_info_dict[file_name]
        candidate_emails = contact_info.get('emails', [])
        candidate_phones = contact_info.get('phones', [])
        if candidate_emails:
            st.markdown(f"**Emails:** {', '.join(candidate_emails)}")
        if candidate_phones:
            st.markdown(f"**Phone Numbers:** {', '.join(candidate_phones)}")

# Build the page only when this file is run as the main script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()