"""Gradio app that scores resume PDFs against a job-description PDF.

Each document is embedded with three encoders (BERT, Longformer, BigBird) and
the cosine similarity between the resume and job-description embeddings is
reported per encoder.
"""

import os
from pathlib import Path

import PyPDF2
import gradio as gr
import numpy as np
import torch
from transformers import BertTokenizer, BertModel
from transformers import LongformerModel, LongformerTokenizer
from transformers import BigBirdModel, BigBirdTokenizer

# Each encoder gets its own tokenizer/model pair.  Previously all three loads
# were bound to the same two names, so only the last one loaded (Longformer)
# was ever used and the "Bert"/"BigBird" scores were really Longformer scores.
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

bigbird_tokenizer = BigBirdTokenizer.from_pretrained('google/bigbird-roberta-base')
bigbird_model = BigBirdModel.from_pretrained('google/bigbird-roberta-base')

longformer_tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
longformer_model = LongformerModel.from_pretrained('allenai/longformer-base-4096')

# Backward-compatible aliases: the original module ended with these two names
# bound to the Longformer tokenizer and model.
tokenizer = longformer_tokenizer
model = longformer_model


def _first_token_embedding(text, text_tokenizer, text_model, max_length):
    """Embed ``text`` and return the first-token hidden state as a NumPy array.

    The text is truncated to ``max_length`` tokens, run through ``text_model``
    without gradient tracking, and the hidden state at sequence position 0 is
    returned with shape ``(batch, hidden)``.
    """
    encoded = text_tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        padding=True,
        max_length=max_length,
    )
    with torch.no_grad():
        outputs = text_model(**encoded)
    # Position 0 holds the leading special token, used here as the aggregate
    # representation of the whole input.
    return outputs.last_hidden_state[:, 0, :].numpy()


def get_longformer_embedding(text):
    """Return the Longformer first-token embedding of ``text`` (max 4096 tokens)."""
    return _first_token_embedding(text, longformer_tokenizer, longformer_model, 4096)


def get_bigbird_embedding(text):
    """Return the BigBird first-token embedding of ``text`` (max 4096 tokens)."""
    return _first_token_embedding(text, bigbird_tokenizer, bigbird_model, 4096)


def get_bert_embedding(text):
    """Return the BERT first-token embedding of ``text`` (max 512 tokens)."""
    return _first_token_embedding(text, bert_tokenizer, bert_model, 512)


def _as_path(file_ref):
    """Return a filesystem path for ``file_ref``.

    Accepts either a path (``str`` / ``os.PathLike``) or an object that exposes
    its path through a ``name`` attribute, such as a temporary-file wrapper.
    Paths are checked first because ``pathlib.Path.name`` is only the final
    component, not the full path.
    """
    if isinstance(file_ref, (str, os.PathLike)):
        return os.fspath(file_ref)
    return file_ref.name


def process_folder(file):
    """Return a message listing every entry in the directory containing ``file``."""
    folder_path = os.path.dirname(_as_path(file))
    files = os.listdir(folder_path)
    return f"Files in folder: {', '.join(files)}"


def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page of a PDF.

    ``pdf_file`` may be a path or an object whose ``name`` attribute is the
    path.  Pages for which no text is extracted contribute an empty string.
    """
    with open(_as_path(pdf_file), 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        return ''.join(page.extract_text() or '' for page in reader.pages)


def calculate_cosine(embedding1, embedding2):
    """Return the cosine similarity of two 1-D vectors.

    Returns ``0.0`` when either vector has zero magnitude, instead of dividing
    by zero and producing NaN.
    """
    denominator = np.linalg.norm(embedding1) * np.linalg.norm(embedding2)
    if denominator == 0:
        return 0.0
    return np.dot(embedding1, embedding2) / denominator


# Report label -> embedding function, in the order scores are printed.
_EMBEDDERS = (
    ("Bert", get_bert_embedding),
    ("Longformer", get_longformer_embedding),
    ("BigBird", get_bigbird_embedding),
)


def foo(files, JD):
    """Score each resume in ``files`` against the job description ``JD``.

    Args:
        files: Iterable of uploaded resume PDFs (paths or objects with a
            ``name`` path attribute).
        JD: Uploaded job-description PDF, in the same form.

    Returns:
        One text block per resume listing its path and the Bert, Longformer
        and BigBird cosine similarities, joined by newlines.  If either input
        is missing, a short prompt asking for the uploads is returned instead.
    """
    if not files or JD is None:
        return "Please upload at least one resume and a job description."

    # The job description is embedded once per encoder and reused for every resume.
    text_jd = extract_text_from_pdf(JD)
    jd_embeddings = {
        label: embed(text_jd).flatten() for label, embed in _EMBEDDERS
    }

    sim = []
    for d in files:
        text = extract_text_from_pdf(d)
        report = [f"\nFile: {_as_path(d)}\n"]
        for label, embed in _EMBEDDERS:
            # flatten() turns the (1, hidden) embedding into a 1-D vector.
            score = calculate_cosine(embed(text).flatten(), jd_embeddings[label])
            report.append(f"{label} Similarity: {score:.4f}\n")
        sim.append("".join(report))

    return "\n".join(sim)


with gr.Blocks() as func:
    inputs = [
        gr.File(file_count="multiple", label="Upload Resume Files"),
        gr.File(label="Upload Job Description"),
    ]
    outputs = gr.Textbox(label="Similarity Scores")
    show = gr.Button(value="Calculate Similarity")
    show.click(foo, inputs, outputs)

func.launch()