Spaces:
Running
on
Zero
Running
on
Zero
# from pdfminer.high_level import extract_pages | |
# from pdfminer.layout import LTTextContainer | |
from tqdm import tqdm | |
import re | |
import gradio as gr | |
import os | |
import accelerate | |
import spaces | |
import subprocess | |
from huggingface_hub import hf_hub_download | |
from llama_cpp import Llama | |
from docling.document_converter import DocumentConverter | |
from huggingface_hub import login | |
# Hugging Face repository and GGUF file name of the model used by this app.
# `model_id` is reused further down to build the local model path.
repo_id = "srijaydeshpande/CVCRaft"
model_id = "cvcraft2.gguf"

# Authenticate against the Hub with the token read from the HF_TOKEN
# environment variable, then download the model file into ./models.
login(token=os.getenv('HF_TOKEN'))
hf_hub_download(repo_id=repo_id, filename=model_id, local_dir="./models")
def process_document(pdf_path):
    """Map every page id of the PDF at *pdf_path* to its normalised text.

    Pages come from ``extract_pages(pdf_path)``; each page is converted by
    ``process_page`` while a ``tqdm`` progress bar tracks the iteration.

    NOTE(review): the ``pdfminer`` import that provides ``extract_pages`` is
    commented out in this file's import section, so that name has to be
    restored before this function can run.

    Returns:
        dict keyed by ``page.pageid`` whose values are the page texts.
    """
    return {
        page.pageid: process_page(page)
        for page in tqdm(extract_pages(pdf_path))
    }
def process_page(extracted_page):
    """Return the text of a single page as one string.

    The objects in ``extracted_page._objs`` are ordered by descending ``y1``
    (presumably so the top of the page comes first). Only instances of
    ``LTTextContainer`` contribute text, each normalised through
    ``extract_text_and_normalize``. The pieces are concatenated and every
    run of newlines is collapsed to a single newline.
    """
    ordered = sorted(
        extracted_page._objs,
        key=lambda obj: obj.y1,
        reverse=True,
    )
    pieces = [
        extract_text_and_normalize(obj)
        for obj in ordered
        if isinstance(obj, LTTextContainer)
    ]
    return re.sub('\n+', '\n', ''.join(pieces))
def extract_text_and_normalize(element):
    """Flatten the text of a layout element into normalised running text.

    The element's text is split on newlines and each line is handled as
    follows:

    * a blank (whitespace-only) line becomes a single newline;
    * any other line is stripped, its inner whitespace runs are collapsed
      to one space, and it is terminated with a space when its last
      character is a word character, digit, comma or hyphen, or with a
      newline for any other ending.

    Args:
        element: object exposing ``get_text()`` that returns a ``str``.

    Returns:
        The concatenation of all processed lines.
    """
    pieces = []
    for raw_line in element.get_text().split('\n'):
        line = raw_line.strip()
        if not line:
            pieces.append('\n')
            continue
        # Raw-string patterns avoid the invalid-escape warnings that
        # '\s' / '\w' trigger in ordinary string literals.
        line = re.sub(r'\s+', ' ', line)
        # A trailing word char / comma / hyphen means the sentence most
        # likely continues on the next line, so join with a space.
        terminator = ' ' if re.search(r'[\w\d\,\-]', line[-1]) else '\n'
        pieces.append(line + terminator)
    return ''.join(pieces)
def txt_to_html(text):
    """Render plain text as a minimal HTML document, one ``<p>`` per line.

    Each line of *text* (split on newlines) is stripped of surrounding
    whitespace and wrapped in a ``<p>`` element. The characters ``&``,
    ``<`` and ``>`` are escaped so that the input is always displayed as
    text and cannot inject markup into the page.

    Args:
        text: plain-text input; an empty string yields one empty paragraph.

    Returns:
        A string of the form ``<html><body>...</body></html>``.
    """
    # Function-scope import keeps this block independent of the file's
    # import section.
    from html import escape

    paragraphs = (
        "<p>{}</p>".format(escape(line.strip(), quote=False))
        for line in text.split('\n')
    )
    return "<html><body>" + "".join(paragraphs) + "</body></html>"
# def craft_cv(llm, cv_text, job_description, maxtokens, temperature, top_probability): | |
def craft_cv(llm, prompt, maxtokens, temperature, top_probability):
    """Send *prompt* to the model and return its chat-completion reply.

    Args:
        llm: object exposing ``create_chat_completion`` (in this file a
            ``llama_cpp.Llama`` instance).
        prompt: complete user message, sent verbatim. No instruction text
            is added here, so the caller must include any instruction
            (e.g. the CV and job description) in the prompt itself.
        maxtokens: forwarded as ``max_tokens``.
        temperature: forwarded as ``temperature``.
        top_probability: forwarded as ``top_p`` (previously accepted but
            silently ignored).

    Returns:
        A ``(cv_text, output)`` tuple. ``cv_text`` is always the empty
        string; ``output`` is the ``content`` of the first returned choice.
    """
    response = llm.create_chat_completion(
        messages=[
            # NOTE(review): the message uses "from"/"value" keys rather than
            # "role"/"content"; presumably this matches the chat template
            # shipped with the model -- confirm before changing.
            {"from": "user", "value": prompt},
        ],
        max_tokens=maxtokens,
        temperature=temperature,
        top_p=top_probability,
    )
    crafted = response['choices'][0]['message']['content']
    return '', crafted
def convert_to_json(llm, cv_text, maxtokens, temperature, top_probability):
    """Ask the model to restructure *cv_text* into the resume JSON schema.

    A fixed instruction prompt describing the target JSON layout is
    prepended to the CV text and sent as a single user message.

    Args:
        llm: object exposing ``create_chat_completion`` (in this file a
            ``llama_cpp.Llama`` instance).
        cv_text: resume text to be converted.
        maxtokens: forwarded as ``max_tokens``.
        temperature: forwarded as ``temperature``.
        top_probability: forwarded as ``top_p`` (previously accepted but
            silently ignored).

    Returns:
        The ``content`` of the first returned choice. The prompt requests
        valid JSON, but the reply is returned as-is without being parsed.
    """
    # The prompt is runtime text sent to the model; its wording and key
    # spellings are kept exactly as they were.
    json_format = """
You are an expert at structuring resumes in JSON format. Given a modified resume text, extract the relevant details and convert them into the following structured JSON format:
{
"profileDetail": {
"name": "[Candidate's Name]",
"email": "[Candidate's Email]",
"phone": "[Candidate's Phone]",
"linkedin": "[Candidate's LinkedIn]",
"languages": "Hindi,English",
"interests": "Cricket",
"location": "[Candidate's Location]",
"role": "[Candidate's Role]"
},
"professionalSummary": "[Candidate's Professional Summary]",
"skills": ["skill1", "skill2"],
"workExperience": [
{
"title": "[Job Title]",
"company": "[Company Name]",
"location": "[Location]",
"startDate": "[Start Date]",
"endDate": "[End Date]",
"responsibilities": ["[Responsibility 1]", "[Responsibility 2]"],
"projects": [
{
"title": "[Project Title]",
"description": "[Project Description]"
}
]
}
],
"education": [
{
"degree": "[Degree]",
"institution": "[Institution]",
"location": "[Location]",
"graduationDate": "[Graduation Date]"
}
],
"certifications": ["[Certification 1]", "[Certification 2]"] ,
"extraCurricular": "[Extra Curricular Activities]",
"achievment": "[Achievements]"
}
Instructions:
- Extract details accurately from the given resume.
- Ensure proper structuring of dates, responsibilities, and projects.
- If a field is missing in the input, leave it as an empty string or an empty list where applicable.
- Maintain proper formatting and avoid unnecessary additions.
Provide the response in a valid JSON format with no additional explanations.
"""
    response = llm.create_chat_completion(
        messages=[
            # NOTE(review): "from"/"value" keys presumably match the chat
            # template shipped with the model -- confirm before changing.
            {"from": "user", "value": json_format + ' CV text: ' + cv_text},
        ],
        max_tokens=maxtokens,
        temperature=temperature,
        top_p=top_probability,
    )
    return response['choices'][0]['message']['content']
def pdf_to_text(prompt, maxtokens=2048, temperature=0, top_probability=0.95):
    """Gradio handler: craft a CV from *prompt* and return it as JSON text.

    A ``Llama`` model is loaded from ``models/<model_id>`` on every call,
    the prompt is passed to ``craft_cv``, and the crafted CV is then passed
    to ``convert_to_json`` with the same sampling settings.

    NOTE: an earlier variant of this handler took an uploaded CV file plus
    a job description and extracted the CV text via ``process_document`` or
    docling's ``DocumentConverter``; that path is currently disabled and the
    prompt text is used directly.

    NOTE(review): ``spaces`` is imported by this file but no ``@spaces.GPU``
    decorator is applied here -- confirm that GPU offload is available.

    Args:
        prompt: text forwarded unchanged to ``craft_cv``.
        maxtokens: generation limit passed to both model calls.
        temperature: sampling temperature passed to both model calls.
        top_probability: passed through to both helper functions.

    Returns:
        The value returned by ``convert_to_json``.
    """
    model = Llama(
        model_path=f"models/{model_id}",
        flash_attn=True,
        n_gpu_layers=81,
        n_batch=1024,
        n_ctx=8192,
    )
    sampling = (maxtokens, temperature, top_probability)
    # craft_cv returns (cv_text, crafted_cv); only the crafted CV is used.
    _, draft = craft_cv(model, prompt, *sampling)
    return convert_to_json(model, draft, *sampling)
# Standalone Gradio components. NOTE: none of these are passed to the
# Interface below, which uses the generic 'text' shortcuts for its single
# input and output; a [cv_file, prompt_text] input list is currently disabled.
temp_slider = gr.Slider(label="Temperature Value", minimum=0, maximum=2, value=0.9)
prob_slider = gr.Slider(label="Max Probability Value", minimum=0, maximum=1, value=0.95)
max_tokens = gr.Number(label="Max Tokens", value=600)
cv_file = gr.File(label='Upload the CV')
prompt_text = gr.Textbox(label='Enter the job description')
output_text = gr.Textbox()

# One text box in, one text box out, handled by pdf_to_text with its
# default sampling parameters.
iface = gr.Interface(
    fn=pdf_to_text,
    inputs=['text'],
    outputs=['text'],
    title='Craft CV',
    description="This application assists to customize CV based on input job description",
    theme=gr.themes.Soft(),
)

# Start the web app as soon as the module is executed.
iface.launch()