import gradio as gr
import pandas as pd
import spacy
from spacy import displacy
import plotly.express as px
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download the NLTK resources used for stopword removal and lemmatization
nltk.download(["stopwords", "wordnet", "omw-1.4"])

# Load the CSV file into a DataFrame
dataset_path = "Resume.csv"
df = pd.read_csv(dataset_path)

# Shuffle the rows and work with a random sample of 500 resumes
df = df.reindex(np.random.permutation(df.index))
data = df.copy().iloc[0:500]

# Load the spaCy English language model with large vocabulary and pre-trained word vectors
spacy_model = spacy.load("en_core_web_lg")

# Path to the file containing skill patterns in JSONL format (2129 skills)
skill_pattern_path = "jz_skill_patterns.jsonl"

# Add an entity ruler to the spaCy pipeline
ruler = spacy_model.add_pipe("entity_ruler")

# Load skill patterns from disk into the entity ruler
ruler.from_disk(skill_pattern_path)
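
# For reference: each line of the JSONL file is expected to be a spaCy
# entity-ruler pattern. An illustrative (not verbatim) example of one line:
#   {"label": "SKILL", "pattern": [{"LOWER": "machine"}, {"LOWER": "learning"}]}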

def get_unique_skills(text):
    doc = spacy_model(text)
    skills = set()
    for ent in doc.ents:
        if ent.label_ == "SKILL":
            skills.add(ent.text)
    return list(skills)
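
# Illustrative usage (actual matches depend on the loaded skill patterns):
#   get_unique_skills("experienced in python and sql development")
#   might return ["python", "sql"] (unordered, since a set is used)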

def preprocess_resume(resume_str):
    # Remove special characters, URLs, and Twitter mentions
    review = re.sub(r'(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?', " ", resume_str)
    
    # Convert to lowercase and tokenize
    review = review.lower().split()
    
    # Lemmatize and remove stopwords (build the stopword set once, not per token)
    lm = WordNetLemmatizer()
    stop_words = set(stopwords.words("english"))
    review = [lm.lemmatize(word) for word in review if word not in stop_words]
    
    # Join the words back into a string
    review = " ".join(review)
    return review
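
# Illustrative example (exact output depends on the NLTK lemmatizer data):
#   preprocess_resume("Managed teams & shipped products!")
#   would return something like "managed team shipped product"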

# Apply the preprocess_resume function to each resume string and store the result in a new column
data["Clean_Resume"] = data["Resume_str"].apply(preprocess_resume)

# Extract skills from each preprocessed resume and store them in a new column
data["skills"] = data["Clean_Resume"].str.lower().apply(get_unique_skills)
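
# After these two steps each row carries (illustrative values):
#   Category="INFORMATION-TECHNOLOGY", Clean_Resume="managed team ...",
#   skills=["python", "sql", ...]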

def get_skills_distribution(Job_Category):
    if Job_Category != "ALL":
        filtered_data = data[data["Category"] == Job_Category]["skills"]
    else:
        filtered_data = data["skills"]

    total_skills = [skill for sublist in filtered_data for skill in sublist]

    fig = px.histogram(
        x=total_skills,
        labels={"x": "Skills"},
        title=f"{Job_Category} Distribution of Skills",
    ).update_xaxes(categoryorder="total descending")

    # Return the figure itself; fig.show() opens a browser window and returns None
    return fig
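
# Note: this helper is not wired into the Gradio UI below. Example standalone use:
#   fig = get_skills_distribution("ALL")
#   fig.show()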


# Register each unique job category as an additional entity-ruler pattern
job_categories = data["Category"].unique()
ruler.add_patterns([{"label": "Job-Category", "pattern": category} for category in job_categories])


# Define the options for highlighting entities
options = {
    "ents": [
        "Job-Category",
        "SKILL",
        "ORG",
        "PERSON",
        "GPE",
        "DATE",
        "ORDINAL",
        "PRODUCT",
    ],
}
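
# "SKILL" and "Job-Category" come from the entity ruler above; the remaining
# labels (ORG, PERSON, GPE, DATE, ORDINAL, PRODUCT) are built-in labels of
# the en_core_web_lg NER model.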

# Define a function to process the resume text and highlight entities
def highlight_entities(resume_text):
    # Process the resume text with spaCy
    doc = spacy_model(resume_text)
    # Render the entities with displacy and return the HTML
    html = displacy.render(doc, style="ent", options=options, jupyter=False)
    return html

def calculate_semantic_similarity(required_skills, resume_skills):
    """
    Calculate the semantic similarity between required skills and resume skills.
    """
    required_skills_str = " ".join(required_skills)
    resume_skills_str = " ".join(resume_skills)
    required_skills_doc = spacy_model(required_skills_str)
    resume_skills_doc = spacy_model(resume_skills_str)
    similarity_score = required_skills_doc.similarity(resume_skills_doc)
    return similarity_score
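
# Illustrative usage (scores depend on en_core_web_lg's word vectors):
#   calculate_semantic_similarity(["python", "sql"], ["python", "databases"])
#   returns a cosine-based float, typically between 0 and 1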

def find_matching_resumes(input_skills, n=5):
    """
    Find and rank the top matching resumes based on input skills.
    """
    # Split on commas and strip surrounding whitespace from each skill
    req_skills = [skill.strip() for skill in input_skills.lower().split(",")]
    ranked_resumes = []
    for _, row in data.iterrows():
        resume_skills = row["skills"]
        similarity_score = calculate_semantic_similarity(req_skills, resume_skills)
        ranked_resumes.append((row["Resume_str"], similarity_score))
    
    # Sort resumes by similarity scores in descending order
    ranked_resumes.sort(key=lambda x: x[1], reverse=True)
    
    # Get the top N matching resumes
    top_matching_resumes = ranked_resumes[:n]
    
    # Construct output in a structured format
    output = []
    for resume_str, score in top_matching_resumes:
        output.append(f"Similarity Score: {score:.3f}\nResume: {resume_str}")

    # Join into a single string so it renders cleanly in the Gradio Textbox
    return "\n\n".join(output)
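
# Note: each query re-scores all 500 sampled resumes by re-parsing their skill
# strings with spaCy; for larger datasets, caching a Doc per resume would help.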


with gr.Blocks() as demo:
    gr.Markdown("Enter your resume text and perform NER, or enter the required skills and find the top matching resumes.")
    with gr.Tab("Enter your resume text and perform NER"):
        text_input = gr.Textbox(lines=10, label="Input Resume Text")
        text_output = gr.HTML(label="Highlighted Entities")
        text_button = gr.Button("Submit")
    with gr.Tab("Enter the required skills (comma-separated) and find the top matching resumes."):
        text_input2 = gr.Textbox(lines=5, label="Input Required Skills (comma-separated)")
        text_output2 = gr.Textbox(label="Top Matching Resumes")
        text_button2 = gr.Button("Submit")

    text_button.click(highlight_entities, inputs=text_input, outputs=text_output)
    text_button2.click(find_matching_resumes, inputs=text_input2, outputs=text_output2)

demo.launch()