hushh jobs v1
Dev Paragiri committed · Commit 39ea97d · Parent(s): 913f46e
Files changed:
- .gitattributes +0 -35
- .gitignore +133 -0
- README.md +4 -11
- candidate.py +24 -0
- embeddings.py +11 -0
- llm_config.py +20 -0
- main.py +124 -0
- rank.py +53 -0
- requirements.txt +123 -0
- shortlisted.csv +2 -0
- template.py +25 -0
.gitattributes
DELETED
@@ -1,35 +0,0 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,133 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# Ignore VSCode settings
+.vscode/
README.md
CHANGED
@@ -1,12 +1,5 @@
----
-title: Hushh Jobs V1
-emoji: 🚀
-colorFrom: red
-colorTo: blue
-sdk: streamlit
-sdk_version: 1.31.0
-app_file: app.py
-pinned: false
----
+# hushh-jobs
 
-
+The Resume Shortlisting Tool is a project designed to streamline the end-to-end hiring process.
+
+It takes multiple resumes and a job description as input and shortlists resumes based on the job description. It returns a downloadable CSV file with structured details of the shortlisted candidates.
candidate.py
ADDED
@@ -0,0 +1,24 @@
+from typing import List
+from langchain_core.pydantic_v1 import BaseModel, Field
+
+
+class Candidate(BaseModel):
+    name: str = Field(description="First name and last name of the candidate.")
+    email: str = Field(description="Email address of the candidate.")
+    phone: str = Field(description="Contact number with country code of the candidate.")
+    location: str = Field(description="City and state where the candidate resides.")
+    degree: List[str] = Field(description="List of the candidate's college degrees.")
+    college: List[str] = Field(description="List of all the colleges the candidate went to.")
+    skills: List[str] = Field(description="List of technical skills of the candidate.")
+    companies: List[str] = Field(
+        description="List only the names of the companies the candidate has worked at."
+    )
+    roles: List[str] = Field(
+        description="List all the job roles of the candidate at previous companies."
+    )
+    degree_year: int = Field(
+        description="The year in which the candidate completed their degree."
+    )
+    experience: float = Field(
+        description="Number of years of professional experience of the candidate."
+    )
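For reference, a minimal sketch of the schema in isolation. The sample values below are hypothetical; langchain_core.pydantic_v1 exposes the pydantic v1 API, so the model can be built and serialized directly:

# Hypothetical values, just to show what a populated Candidate looks like.
from candidate import Candidate

sample = Candidate(
    name="Jane Doe",
    email="jane@example.com",
    phone="+1 555 0100",
    location="Austin, Texas",
    degree=["B.S. Computer Science"],
    college=["University of Texas"],
    skills=["Python", "SQL"],
    companies=["Acme Corp"],
    roles=["Backend Engineer"],
    degree_year=2019,
    experience=4.5,
)
print(sample.json())  # pydantic v1 serialization via langchain_core.pydantic_v1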
embeddings.py
ADDED
@@ -0,0 +1,11 @@
+import requests
+import os
+
+model_id = "sentence-transformers/all-MiniLM-L6-v2"
+hf_token = os.environ["HF_TOKEN"]  # fail fast if the API token is missing
+api_url = f"https://api-inference.huggingface.co/pipeline/feature-extraction/{model_id}"
+headers = {"Authorization": f"Bearer {hf_token}"}
+
+
+def text_embedding(texts):
+    # Request embeddings from the HF Inference API, waiting for the model to load if needed.
+    response = requests.post(api_url, headers=headers, json={"inputs": texts, "options": {"wait_for_model": True}})
+    return response.json()
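A quick smoke test of text_embedding, assuming HF_TOKEN is set. For all-MiniLM-L6-v2 the feature-extraction endpoint should return a 384-dimensional sentence embedding for a single string:

# Hypothetical usage; requires a valid HF_TOKEN in the environment.
from embeddings import text_embedding

vec = text_embedding("Senior Python developer with NLP experience")
print(len(vec))  # expected: 384 for all-MiniLM-L6-v2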
llm_config.py
ADDED
@@ -0,0 +1,20 @@
+from langchain.output_parsers import PydanticOutputParser
+from langchain.prompts import PromptTemplate
+from langchain.llms import OpenAI
+from candidate import Candidate
+
+
+def instantiate_llm():
+    # Completion model plus a prompt that injects the Candidate schema's format instructions.
+    model_name = "gpt-3.5-turbo-instruct"
+    temperature = 0.0
+    model = OpenAI(model_name=model_name, temperature=temperature, max_tokens=600)
+    parser = PydanticOutputParser(pydantic_object=Candidate)
+
+    prompt = PromptTemplate(
+        template="Answer the user query.\n{format_instructions}\n{query}\n",
+        input_variables=["query"],
+        partial_variables={"format_instructions": parser.get_format_instructions()},
+    )
+
+    return model, prompt
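A minimal sketch of how the returned pair is meant to be used; this mirrors the call pattern in main.py, assumes OPENAI_API_KEY is set, and the resume-text placeholder is hypothetical:

# Hypothetical usage; requires OPENAI_API_KEY in the environment.
from llm_config import instantiate_llm

model, prompt = instantiate_llm()
query = "Return only a json based on this candidate's resume information: <resume text>"
raw = model(prompt.format_prompt(query=query).to_string())  # completion-style call
print(raw)  # JSON string following the Candidate format instructions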
main.py
ADDED
@@ -0,0 +1,124 @@
+import os
+import csv
+import json
+import streamlit as st
+from PyPDF2 import PdfReader
+from llm_config import instantiate_llm
+from langchain.callbacks import get_openai_callback
+from langchain.llms import OpenAI
+from langchain.chains import LLMChain
+from langchain.prompts import PromptTemplate
+from langchain.output_parsers import PydanticOutputParser
+from template import prompt_template
+import pandas as pd
+from candidate import Candidate
+import logging
+from rank import extract_and_rank
+
+
+def extract_resume(resume):
+    # Read every page of an uploaded PDF and return its concatenated text.
+    reader = PdfReader(resume)
+    return "".join(page.extract_text() for page in reader.pages)
+
+
+def main():
+    st.set_page_config(layout="wide", page_title="Hushh Jobs")
+    st.header("Hushh Jobs")
+    model, prompt = instantiate_llm()
+    col1, col2, col3 = st.columns(3)
+
+    with col1:
+        resumes = st.file_uploader(
+            "Upload resumes here!", accept_multiple_files=True, type="pdf"
+        )
+
+    with col2:
+        no_of_resumes = st.number_input(
+            "Enter the number of resumes you want to shortlist", step=1
+        )
+
+    with col3:
+        job_description = st.text_area("Enter the job description here!", height=250)
+        rank_btn = st.button("Rank")
+
+    if resumes and rank_btn:
+        if len(job_description) < 25:
+            st.warning(
+                "Invalid or empty job description! Please make sure your job description has at least 25 characters!"
+            )
+        else:
+            dict_object = {}
+            rows = []
+            ranked_resumes, embeddings_bank, text_bank = extract_and_rank(
+                resumes, job_description
+            )
+
+            no_of_resumes = int(no_of_resumes)
+            for selected_resume in ranked_resumes[:no_of_resumes]:
+                resume_text = text_bank[selected_resume[0]]
+
+                doc_query = f"Return only a json based on this candidate's resume information: {resume_text}"
+                llm_input = prompt.format_prompt(query=doc_query)
+
+                # PydanticOutputParser structures the model response into the Candidate schema.
+                parser = PydanticOutputParser(pydantic_object=Candidate)
+
+                with get_openai_callback() as cb:
+                    try:
+                        result = model(llm_input.to_string())
+                        st.success(result)
+                        class_object = parser.parse(result)  # parse the raw completion into a Candidate
+                        dict_object = class_object.__dict__
+                        rows.append(dict_object)
+                    except Exception as error:
+                        print(error)
+            field_names = [
+                "name",
+                "email",
+                "phone",
+                "location",
+                "degree",
+                "college",
+                "skills",
+                "companies",
+                "roles",
+                "degree_year",
+                "experience",
+            ]
+            user_csv = "shortlisted.csv"
+            write_csv(user_csv=user_csv, field_names=field_names, rows=rows)
+            df = pd.read_csv(user_csv)
+            st.dataframe(df)
+
+
+def write_csv(user_csv, field_names, rows):
+    # Persist the shortlisted candidates to a CSV with a fixed column order.
+    with open(user_csv, "w") as csvfile:
+        writer = csv.DictWriter(csvfile, fieldnames=field_names)
+        writer.writeheader()
+        for row in rows:
+            writer.writerow(row)
+
+
+def write_response(user_csv, response: str):
+    """
+    Write a response from an agent to the Streamlit app.
+
+    Args:
+        user_csv: Path to the shortlisted-candidates CSV.
+        response: The response from the agent, expected to be a one-line pandas query.
+
+    Returns:
+        None.
+    """
+    df = pd.read_csv(user_csv)
+    data = eval(response)  # evaluate the model-generated pandas query against df
+    st.dataframe(data=data, use_container_width=True)
+
+
+if __name__ == "__main__":
+    main()
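To exercise write_csv in isolation, a small sketch with a hypothetical row; importing main pulls in streamlit and langchain, so the app's dependencies must be installed:

# Hypothetical standalone check of write_csv.
from main import write_csv
import pandas as pd

row = {"name": "Jane Doe", "email": "jane@example.com", "phone": "+1 555 0100",
       "location": "Austin, Texas", "degree": ["B.S."], "college": ["UT Austin"],
       "skills": ["Python"], "companies": ["Acme Corp"], "roles": ["Engineer"],
       "degree_year": 2019, "experience": 4.5}
write_csv(user_csv="shortlisted.csv", field_names=list(row.keys()), rows=[row])
print(pd.read_csv("shortlisted.csv").head())  # list fields are stored as their string repr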
rank.py
ADDED
@@ -0,0 +1,53 @@
+from PyPDF2 import PdfReader
+import streamlit as st
+from embeddings import text_embedding
+import scipy.spatial.distance
+
+
+def extract_and_rank(resumes, job_description):
+    # Build per-resume embedding and raw-text lookups, then rank them against the JD.
+    out_embed_dict = {}
+    out_text_dict = {}
+    for resume in resumes:
+        reader = PdfReader(resume)
+        raw_text = "".join(page.extract_text() for page in reader.pages)
+        embedding = text_embedding(raw_text)
+        out_embed_dict[resume.name] = embedding
+        out_text_dict[resume.name] = raw_text
+    ranked_output = rankings(out_dict=out_embed_dict, query=job_description)
+    return ranked_output, out_embed_dict, out_text_dict
+
+
+def get_sim(query_embedding, average_vec):
+    # Cosine similarity between the job-description embedding and a resume embedding.
+    try:
+        sim = [(1 - scipy.spatial.distance.cosine(query_embedding, average_vec))]
+        return sim
+    except Exception:
+        return [0]
+
+
+def rankings(out_dict, query):
+    # Sort resumes by cosine similarity to the job-description embedding.
+    query_embedding = text_embedding(query)
+    rank = []
+    for k, v in out_dict.items():
+        rank.append((k, get_sim(query_embedding, v)))
+    rank = sorted(rank, key=lambda t: t[1], reverse=True)
+    return rank
+
+
+# def data_clean(text):
+#     pattern = r'[^a-zA-Z0-9\s]'
+#     text = re.sub(pattern, '', ' '.join(text))
+#     tokens = [token.strip() for token in text.split()]
+#     filtered = [token for token in tokens if token.lower() not in stopword_list]
+#     filtered = ' '.join(filtered)
+#     return filtered
+
+
+# def embeddings(word):
+#     if word in wv.key_to_index:
+#         return wv.get_vector(word)
+#     else:
+#         return np.zeros(300)
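The ranking logic reduces to cosine similarity; a self-contained toy example with hypothetical 3-dimensional vectors, no API calls involved:

# Toy illustration of the similarity used in rankings(); vectors are made up.
from scipy.spatial.distance import cosine

jd = [0.9, 0.1, 0.0]  # job-description embedding (toy)
resumes = {"a.pdf": [0.8, 0.2, 0.1], "b.pdf": [0.0, 1.0, 0.3]}
rank = sorted(
    ((name, [1 - cosine(jd, vec)]) for name, vec in resumes.items()),
    key=lambda t: t[1],
    reverse=True,
)
print(rank)  # a.pdf ranks first: its vector points much closer to the JD's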
requirements.txt
ADDED
@@ -0,0 +1,123 @@
+aiohttp==3.9.1
+aiosignal==1.3.1
+altair==5.2.0
+annotated-types==0.6.0
+anyio==4.2.0
+arrow==1.3.0
+attrs==23.1.0
+av==10.0.0
+black==23.12.1
+blinker==1.7.0
+cachetools==5.3.2
+certifi==2023.11.17
+charset-normalizer==3.3.2
+click==8.1.7
+coloredlogs==15.0.1
+ctranslate2==3.18.0
+dataclasses-json==0.6.3
+distro==1.8.0
+faster-whisper==0.7.1
+filelock==3.12.2
+Flask==2.3.3
+Flask-Cors==4.0.0
+flatbuffers==23.5.26
+frozenlist==1.4.1
+fsspec==2023.6.0
+gensim==4.3.2
+gitdb==4.0.11
+GitPython==3.1.40
+google-api-core==2.14.0
+google-auth==2.23.4
+google-cloud==0.34.0
+google-cloud-core==2.3.3
+google-cloud-speech==2.22.0
+google-cloud-storage==2.13.0
+google-crc32c==1.5.0
+google-resumable-media==2.6.0
+googleapis-common-protos==1.61.0
+greenlet==3.0.1
+grpcio==1.59.3
+grpcio-status==1.59.3
+gunicorn==21.2.0
+h11==0.14.0
+httpcore==1.0.2
+httpx==0.26.0
+huggingface-hub==0.16.4
+humanfriendly==10.0
+idna==3.6
+importlib-metadata==6.11.0
+itsdangerous==2.1.2
+Jinja2==3.1.2
+jinja2-time==0.2.0
+joblib==1.3.2
+jsonpatch==1.33
+jsonpointer==2.4
+jsonschema==4.20.0
+jsonschema-specifications==2023.11.2
+langchain==0.0.352
+langchain-community==0.0.5
+langchain-core==0.1.2
+langsmith==0.0.72
+markdown-it-py==3.0.0
+MarkupSafe==2.1.3
+marshmallow==3.20.1
+mdurl==0.1.2
+mpmath==1.3.0
+msgpack==1.0.7
+multidict==6.0.4
+mypy-extensions==1.0.0
+nltk==3.8.1
+numpy==1.26.2
+onnxruntime==1.15.1
+openai==1.6.0
+packaging==23.2
+pandas==2.1.4
+pathspec==0.12.1
+Pillow==10.1.0
+platformdirs==4.1.0
+proto-plus==1.22.3
+protobuf==4.25.1
+pyarrow==14.0.2
+pyasn1==0.5.0
+pyasn1-modules==0.3.0
+pydantic==2.5.2
+pydantic_core==2.14.5
+pydeck==0.8.1b0
+Pygments==2.17.2
+pynvim==0.4.3
+PyPDF2==3.0.1
+python-dateutil==2.8.2
+python-dotenv==1.0.0
+pytz==2023.3.post1
+PyYAML==6.0.1
+referencing==0.32.0
+regex==2023.10.3
+requests==2.31.0
+rich==13.7.0
+rpds-py==0.15.2
+rsa==4.9
+scipy==1.11.4
+six==1.16.0
+smart-open==6.4.0
+smmap==5.0.1
+sniffio==1.3.0
+SQLAlchemy==2.0.23
+streamlit==1.29.0
+sympy==1.12
+tenacity==8.2.3
+tokenizers==0.13.3
+toml==0.10.2
+toolz==0.12.0
+tornado==6.4
+tqdm==4.66.1
+types-python-dateutil==2.8.19.14
+typing-inspect==0.9.0
+typing_extensions==4.9.0
+tzdata==2023.3
+tzlocal==5.2
+urllib3==2.1.0
+validators==0.22.0
+watchdog==3.0.0
+Werkzeug==2.3.7
+yarl==1.9.4
+zipp==3.17.0
shortlisted.csv
ADDED
@@ -0,0 +1,2 @@
+name,email,phone,location,degree,college,skills,companies,roles,degree_year,experience
+Ayushi Bhatnagar,[email protected],9873047199,"Ghaziabad, U.P.",['B.Tech'],['Krishna Engineering College(AKTU)'],"['Swift', 'Dart', 'Core Java', 'SwiftUI', 'UIkit', 'Flutter', 'MSSQL', 'Xcode', 'Android Studio', 'Postman', 'Gitlab', 'Github Desktop', 'Zeplin', 'Figma', 'MS Word', 'MS Excel', 'Google Sheets']","['Augurs Technologies', 'Innefu Labs', 'Devarty Technologies']","['Executive (iOS)', 'Software Engineer(iOS)']",2020,2.0
template.py
ADDED
@@ -0,0 +1,25 @@
+prompt_template = """
+
+
+Question: {question}\
+The response must ONLY contain the code snippet and NOTHING else.
+The response must be one single line which contains only the query and must not be assigned to a variable.
+
+Make sure you follow the instructions/thought process below.
+Return a pandas DF query based on the question and CSV file schema below.
+
+Instructions:
+Make sure that the pandas query always accounts for search results which are very similar to the one asked in the question.
+
+Example 1:
+Question: Candidates who have worked at a bank
+df[df['companies'].str.contains('bank', case=False, na=False)]
+
+Example 2:
+Question: Candidates from Gurgaon
+df[df['location'].str.contains('Gurgaon', case=False, na=False)]
+
+CSV file schema:
+You have access to a resume candidates CSV file which has the name, email, location, degree, college, skills, companies, roles, degree_year, and experience as columns.
+
+"""
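A sketch of how this template is intended to be used; the question and the model's one-line reply below are hypothetical, and the final eval mirrors write_response() in main.py:

# Hypothetical use of prompt_template: fill in a question, send to an LLM,
# then evaluate the one-line pandas query it returns against the CSV.
import pandas as pd
from template import prompt_template

filled = prompt_template.format(question="Candidates with Flutter experience")
# An LLM call on `filled` is expected to return a single pandas query, e.g.:
response = "df[df['skills'].str.contains('Flutter', case=False, na=False)]"

df = pd.read_csv("shortlisted.csv")
print(eval(response))  # filters the shortlist the same way write_response() does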