ahmadfareedsukhera committed
Commit 7ed0298 · verified · 1 Parent(s): 8bf4208

Upload app.py

Files changed (1)
  1. app.py +128 -0
app.py ADDED
@@ -0,0 +1,128 @@
+ import os
+ import PyPDF2
+ from transformers import BertTokenizer, BertModel
+ from transformers import LongformerModel, LongformerTokenizer
+ from transformers import BigBirdModel, BigBirdTokenizer
+ import numpy as np
+ from groq import Groq
+ import gradio as gr
+ from pathlib import Path
+ import torch
+
+
+ # Load the BERT tokenizer and model
+ bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+ bert_model = BertModel.from_pretrained('bert-base-uncased')
+
+ # Load the BigBird tokenizer and model
+ bigbird_tokenizer = BigBirdTokenizer.from_pretrained('google/bigbird-roberta-base')
+ bigbird_model = BigBirdModel.from_pretrained('google/bigbird-roberta-base')
+
+ # Load the Longformer tokenizer and model
+ longformer_tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
+ longformer_model = LongformerModel.from_pretrained('allenai/longformer-base-4096')
+
+
+ def get_longformer_embedding(text):
+     # Tokenize the text (Longformer handles sequences up to 4096 tokens)
+     inputs = longformer_tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=4096)
+
+     # Get the embeddings from Longformer
+     with torch.no_grad():
+         outputs = longformer_model(**inputs)
+
+     # Use the [CLS] token's embedding as the aggregate representation
+     cls_embedding = outputs.last_hidden_state[:, 0, :].numpy()
+
+     return cls_embedding
+
+
+ def get_bigbird_embedding(text):
+     # Tokenize the text (BigBird handles sequences up to 4096 tokens)
+     inputs = bigbird_tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=4096)
+
+     # Get the embeddings from BigBird
+     with torch.no_grad():
+         outputs = bigbird_model(**inputs)
+
+     # Use the [CLS] token's embedding as the aggregate representation
+     cls_embedding = outputs.last_hidden_state[:, 0, :].numpy()
+
+     return cls_embedding
+
+
+ def get_bert_embedding(text):
+     # Tokenize the text (BERT is limited to 512 tokens)
+     inputs = bert_tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
+
+     # Get the embeddings from BERT
+     with torch.no_grad():
+         outputs = bert_model(**inputs)
+
+     # Use the [CLS] token's embedding as the aggregate representation
+     cls_embedding = outputs.last_hidden_state[:, 0, :].numpy()
+
+     return cls_embedding
+
+
+ def process_folder(file):
+     folder_path = os.path.dirname(file.name)  # Get the directory of the selected file
+     files = os.listdir(folder_path)  # List all files in the directory
+     file_paths = [os.path.join(folder_path, f) for f in files]  # Get full paths of all files
+     return f"Files in folder: {', '.join(files)}"
+
+
+ # Extract the text from every page of a PDF
+ def extract_text_from_pdf(pdf_file):
+     text = ''
+     with open(pdf_file, 'rb') as file:
+         reader = PyPDF2.PdfReader(file)
+         for page in reader.pages:
+             text += page.extract_text() or ''
+     return text
+
+
+ def calculate_cosine(embedding1, embedding2):
+     # Cosine similarity: dot product divided by the product of the magnitudes
+     dot_product = np.dot(embedding1, embedding2)
+     magnitude1 = np.linalg.norm(embedding1)
+     magnitude2 = np.linalg.norm(embedding2)
+     return dot_product / (magnitude1 * magnitude2)
+
+
+ def foo(files, JD):
+     # Extract text and compute embeddings for the job description with each model
+     text_jd = extract_text_from_pdf(JD.name)  # pass the uploaded file's path
+     JD_embedding_bert = get_bert_embedding(text_jd).flatten()  # Flatten to match dimensions
+     JD_embedding_longformer = get_longformer_embedding(text_jd).flatten()
+     JD_embedding_bigbird = get_bigbird_embedding(text_jd).flatten()
+
+     sim = []
+
+     for d in files:
+         text = extract_text_from_pdf(d.name)
+         # Compute embeddings for the resume with each model
+         resume_embedding_bert = get_bert_embedding(text).flatten()
+         resume_embedding_longformer = get_longformer_embedding(text).flatten()
+         resume_embedding_bigbird = get_bigbird_embedding(text).flatten()
+         # Calculate cosine similarity for each model
+         similarity_bert = calculate_cosine(resume_embedding_bert, JD_embedding_bert)
+         similarity_longformer = calculate_cosine(resume_embedding_longformer, JD_embedding_longformer)
+         similarity_bigbird = calculate_cosine(resume_embedding_bigbird, JD_embedding_bigbird)
+         # Collect the per-file results
+         sim.append(f"\nFile: {d.name}\n"
+                    f"Bert Similarity: {similarity_bert:.4f}\n"
+                    f"Longformer Similarity: {similarity_longformer:.4f}\n"
+                    f"BigBird Similarity: {similarity_bigbird:.4f}\n")
+
+     return "\n".join(sim)  # Join the list into a single string for Gradio output
+
+
+ with gr.Blocks() as func:
+     inputs = [gr.File(file_count="multiple", label="Upload Resume Files"), gr.File(label="Upload Job Description")]
+     outputs = gr.Textbox(label="Similarity Scores")
+     show = gr.Button(value="Calculate Similarity")
+     show.click(foo, inputs, outputs)
+
+ func.launch()
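As a quick sanity check of the cosine-similarity helper outside the Gradio app, a minimal sketch follows; the vectors are illustrative stand-ins for flattened [CLS] embeddings, not real model output:

import numpy as np

# Illustrative vectors standing in for flattened [CLS] embeddings
a = np.array([1.0, 0.0, 1.0])
b = np.array([1.0, 0.0, 0.0])

# Same formula as calculate_cosine: dot product over the product of the norms
similarity = np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
print(f"{similarity:.4f}")  # prints 0.7071

Identical vectors score 1.0 and orthogonal vectors score 0.0, so higher values indicate a closer match between resume and job description.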