# CVCraft / app.py
# srijaydeshpande's picture
# Update app.py
# 5c7d67b verified
# from pdfminer.high_level import extract_pages
# from pdfminer.layout import LTTextContainer
from tqdm import tqdm
import re
import gradio as gr
import os
import accelerate
import spaces
import subprocess
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from docling.document_converter import DocumentConverter
from huggingface_hub import login
# Log in to the Hugging Face Hub with the token read from the HF_TOKEN
# environment variable, then download the GGUF weights into ./models, which
# is where pdf_to_text later loads them from ("models/" + model_id).
# NOTE(review): os.getenv returns None when HF_TOKEN is unset, so login()
# then receives token=None -- confirm that is acceptable for this deployment.
login(token=os.getenv("HF_TOKEN"))

repo_id = "srijaydeshpande/CVCRaft"
model_id = "cvcraft2.gguf"

hf_hub_download(repo_id=repo_id, filename=model_id, local_dir="./models")
def process_document(pdf_path):
    """Map each page id of the PDF at ``pdf_path`` to that page's text.

    Pages are obtained from ``extract_pages`` and converted one at a time by
    ``process_page``; ``tqdm`` wraps the iteration to show progress.

    NOTE(review): in the visible part of this file the pdfminer import that
    would provide ``extract_pages`` is commented out, so calling this
    function as-is raises ``NameError``.

    Args:
        pdf_path: Value forwarded unchanged to ``extract_pages``.

    Returns:
        A dict keyed by each page's ``pageid`` whose values are the strings
        returned by ``process_page``.
    """
    return {
        page.pageid: process_page(page)
        for page in tqdm(extract_pages(pdf_path))
    }
def process_page(extracted_page):
    """Return the concatenated, normalized text of one extracted page.

    The page's ``_objs`` are ordered by descending ``y1`` (presumably so that
    text nearer the top of the page comes first -- confirm against the
    layout library's coordinate system). Only ``LTTextContainer`` instances
    contribute text, each via ``extract_text_and_normalize``. Finally, runs
    of consecutive newlines are squeezed into a single newline.

    Args:
        extracted_page: Page object exposing ``_objs``, an iterable of layout
            objects that each have a ``y1`` attribute.

    Returns:
        The page text as a single string (empty when there is no text).
    """
    ordered = sorted(extracted_page._objs, key=lambda obj: obj.y1, reverse=True)
    pieces = [
        extract_text_and_normalize(obj)
        for obj in ordered
        if isinstance(obj, LTTextContainer)
    ]
    return re.sub('\n+', '\n', ''.join(pieces))
def extract_text_and_normalize(element):
    """Return the text of ``element`` with whitespace and breaks normalized.

    The string from ``element.get_text()`` is split on newlines and every
    line is stripped. Each line then contributes:

    * a single newline, when the stripped line is empty;
    * otherwise the line with every whitespace run collapsed to one space,
      followed by a space when its last character is a word character, a
      comma or a hyphen (presumably because the sentence continues on the
      next line), or followed by a newline for any other last character.

    Args:
        element: Object exposing ``get_text()`` that returns a string.

    Returns:
        The concatenation of all normalized lines.
    """
    parts = []
    for raw_line in element.get_text().split('\n'):
        line = raw_line.strip()
        if not line:
            parts.append('\n')
            continue
        # Raw strings: "\s", "\w", "\d", "\," and "\-" are not valid escapes
        # in an ordinary string literal and trigger a SyntaxWarning on
        # recent Python versions. The compiled patterns are unchanged.
        line = re.sub(r'\s+', ' ', line)
        continues = re.search(r'[\w\d\,\-]', line[-1])
        parts.append(line + (' ' if continues else '\n'))
    return ''.join(parts)
def txt_to_html(text):
    """Render plain ``text`` as a minimal HTML page, one ``<p>`` per line.

    ``text`` is split on newlines; each line is stripped of surrounding
    whitespace, HTML-escaped and wrapped in ``<p>...</p>``. Empty lines
    produce empty paragraphs.

    Args:
        text: Plain text to convert.

    Returns:
        A string of the form ``<html><body>...</body></html>``.
    """
    # Function-scope import keeps this block self-contained.
    import html

    # Escape &, < and > so that plain text such as "R&D" or "<lead>" is shown
    # literally instead of being parsed (or injected) as markup.
    paragraphs = "".join(
        "<p>{}</p>".format(html.escape(line.strip(), quote=False))
        for line in text.split('\n')
    )
    return "<html><body>" + paragraphs + "</body></html>"
# def craft_cv(llm, cv_text, job_description, maxtokens, temperature, top_probability):
def craft_cv(llm, prompt, maxtokens, temperature, top_probability):
    """Send ``prompt`` to the model as one user message and return its reply.

    The prompt is passed through verbatim; the caller is responsible for
    including the instruction, the CV and the job description in it.

    NOTE(review): the message dict uses "from"/"value" keys rather than
    "role"/"content"; presumably this matches the chat template of the model
    in use -- confirm before changing.

    Args:
        llm: Object exposing ``create_chat_completion`` (a ``Llama``
            instance at the call site in this file).
        prompt: Full text of the user message.
        maxtokens: Forwarded as ``max_tokens``.
        temperature: Forwarded as ``temperature``.
        top_probability: Forwarded as ``top_p``. It was previously accepted
            but silently ignored.

    Returns:
        A ``(cv_text, output)`` tuple. ``cv_text`` is always the empty string
        (kept for callers that unpack two values) and ``output`` is the
        content of the first choice's message.
    """
    response = llm.create_chat_completion(
        messages=[
            {"from": "user", "value": prompt},
        ],
        max_tokens=maxtokens,
        temperature=temperature,
        top_p=top_probability,
    )
    crafted = response['choices'][0]['message']['content']
    return '', crafted
def convert_to_json(llm, cv_text, maxtokens, temperature, top_probability):
    """Ask the model to restructure ``cv_text`` into the resume JSON schema.

    A fixed instruction block describing the target JSON layout is sent,
    followed by ``' CV text: '`` and ``cv_text``, as a single user message.
    The reply is returned as-is; it is not parsed or validated as JSON here.

    NOTE(review): the template hard-codes sample values for "languages" and
    "interests" and spells one key "achievment". Both are left untouched
    because the model and downstream consumers may depend on the exact
    prompt and schema -- confirm before editing.
    NOTE(review): the message dict uses "from"/"value" keys rather than
    "role"/"content"; presumably this matches the model's chat template.

    Args:
        llm: Object exposing ``create_chat_completion`` (a ``Llama``
            instance at the call site in this file).
        cv_text: Resume text to convert.
        maxtokens: Forwarded as ``max_tokens``.
        temperature: Forwarded as ``temperature``.
        top_probability: Forwarded as ``top_p``. It was previously accepted
            but silently ignored.

    Returns:
        The content string of the first choice's message.
    """
    # The template lines are deliberately flush-left: any indentation inside
    # the triple-quoted literal would become part of the prompt.
    json_format = """
You are an expert at structuring resumes in JSON format. Given a modified resume text, extract the relevant details and convert them into the following structured JSON format:
{
"profileDetail": {
"name": "[Candidate's Name]",
"email": "[Candidate's Email]",
"phone": "[Candidate's Phone]",
"linkedin": "[Candidate's LinkedIn]",
"languages": "Hindi,English",
"interests": "Cricket",
"location": "[Candidate's Location]",
"role": "[Candidate's Role]"
},
"professionalSummary": "[Candidate's Professional Summary]",
"skills": ["skill1", "skill2"],
"workExperience": [
{
"title": "[Job Title]",
"company": "[Company Name]",
"location": "[Location]",
"startDate": "[Start Date]",
"endDate": "[End Date]",
"responsibilities": ["[Responsibility 1]", "[Responsibility 2]"],
"projects": [
{
"title": "[Project Title]",
"description": "[Project Description]"
}
]
}
],
"education": [
{
"degree": "[Degree]",
"institution": "[Institution]",
"location": "[Location]",
"graduationDate": "[Graduation Date]"
}
],
"certifications": ["[Certification 1]", "[Certification 2]"] ,
"extraCurricular": "[Extra Curricular Activities]",
"achievment": "[Achievements]"
}
Instructions:
- Extract details accurately from the given resume.
- Ensure proper structuring of dates, responsibilities, and projects.
- If a field is missing in the input, leave it as an empty string or an empty list where applicable.
- Maintain proper formatting and avoid unnecessary additions.
Provide the response in a valid JSON format with no additional explanations.
"""
    response = llm.create_chat_completion(
        messages=[
            {"from": "user", "value": json_format + ' CV text: ' + cv_text},
        ],
        max_tokens=maxtokens,
        temperature=temperature,
        top_p=top_probability,
    )
    return response['choices'][0]['message']['content']
@spaces.GPU(duration=40)
def pdf_to_text(prompt, maxtokens=2048, temperature=0, top_probability=0.95):
    """Run the two-step CV pipeline on ``prompt`` and return the model's JSON text.

    On every call a ``Llama`` model is loaded from ``models/<model_id>``.
    ``craft_cv`` then produces the revised CV from ``prompt`` and
    ``convert_to_json`` asks the model to restructure that text.

    NOTE: a variant that accepted a CV file plus a job description and
    extracted the CV text itself (through ``process_document`` or docling's
    ``DocumentConverter``) used to be kept here as commented-out code; the
    caller now supplies the complete prompt.

    Args:
        prompt: Full prompt text forwarded to ``craft_cv``.
        maxtokens: Token limit forwarded to both model calls.
        temperature: Sampling temperature forwarded to both model calls.
        top_probability: Forwarded to both helper functions.

    Returns:
        The string returned by ``convert_to_json``.
    """
    model = Llama(
        model_path=f"models/{model_id}",
        n_ctx=8192,
        n_batch=1024,
        n_gpu_layers=81,
        flash_attn=True,
    )
    # craft_cv returns (cv_text, output); only the generated output is used.
    _, revised_cv = craft_cv(model, prompt, maxtokens, temperature, top_probability)
    return convert_to_json(model, revised_cv, maxtokens, temperature, top_probability)
# Gradio components for sampling controls and CV / job-description inputs.
# NOTE(review): none of these six components is passed to the Interface
# below, which uses the 'text' shortcuts instead -- confirm whether they
# were meant to be wired in.
temp_slider = gr.Slider(
    minimum=0, maximum=2, value=0.9, label="Temperature Value"
)
prob_slider = gr.Slider(
    minimum=0, maximum=1, value=0.95, label="Max Probability Value"
)
max_tokens = gr.Number(value=600, label="Max Tokens")
cv_file = gr.File(label="Upload the CV")
prompt_text = gr.Textbox(label="Enter the job description")
output_text = gr.Textbox()

# Single text box in, single text box out; pdf_to_text therefore runs with
# its default maxtokens / temperature / top_probability values.
iface = gr.Interface(
    fn=pdf_to_text,
    # inputs=[cv_file, prompt_text],
    inputs=["text"],
    outputs=["text"],
    title="Craft CV",
    description="This application assists to customize CV based on input job description",
    theme=gr.themes.Soft(),
)

# Start the web app as soon as this module is executed.
iface.launch()