resume-revealer / utils.py
astro21's picture
Update utils.py
1b6cc04 verified
raw
history blame contribute delete
No virus
4.11 kB
import os
import shutil
from dedoc import DedocManager
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from ResumeStructure import ResumeStructure
from fastapi import UploadFile
from prompt_template import template_format_instructions, template
from typing import List
# Directory where uploaded files are written while they are being parsed.
TEMP_DIR = "/temp_files"
# NOTE(review): the import-time creation of TEMP_DIR below is disabled, so
# nothing at module level guarantees that the directory exists.
# if not os.path.exists(TEMP_DIR):
#     os.makedirs(TEMP_DIR)
async def process_file_with_dedoc(file: UploadFile):
    """
    Process the file using Dedoc and return the output data.

    The upload is copied into TEMP_DIR, parsed with Dedoc, and the temporary
    copy is removed again whether or not parsing succeeds. Files with an
    unsupported extension are rejected before anything is written to disk.

    Args:
    - file: The UploadedFile object to be processed.

    Returns:
    - Output data if the file is processed successfully, None if the file
      extension is not supported (or the upload has no filename).

    Raises:
    - Any exception raised while saving the upload or while Dedoc parses it
      is propagated to the caller after the temporary file has been removed.
    """
    supported_formats = ['jpg', 'jpeg', 'png', 'docx', 'pdf', 'html', 'doc']

    print(f"Processing file '{file.filename}'...")

    # The filename is supplied by the client: keep only its last path
    # component so a name such as "../../etc/x.pdf" cannot escape TEMP_DIR.
    # A missing filename becomes "" and is rejected by the check below.
    safe_name = os.path.basename(file.filename or "")

    # Extension without the leading dot, lowercased for the comparison.
    file_extension = os.path.splitext(safe_name)[1][1:].lower()

    # Reject unsupported files before touching the disk so that nothing is
    # left behind in TEMP_DIR for them.
    if file_extension not in supported_formats:
        print(f"Cannot process file '{file.filename}'. Unsupported file format.")
        return None

    manager = DedocManager()

    # Make sure the target directory exists; a no-op when it already does.
    os.makedirs(TEMP_DIR, exist_ok=True)
    file_path = os.path.join(TEMP_DIR, safe_name)

    try:
        # Save the uploaded file to the temporary directory.
        with open(file_path, "wb") as buffer:
            shutil.copyfileobj(file.file, buffer)

        # Process the file using Dedoc.
        output = manager.parse(file_path)
        return output.to_api_schema().model_dump()
    finally:
        # Always remove the temporary copy, even when saving or parsing
        # failed; the file may not exist if open() itself raised.
        try:
            os.remove(file_path)
        except FileNotFoundError:
            pass
async def extract_text_from_all_levels(data):
    """
    Collect the text of every subparagraph, at any depth, in the JSON data.

    Args:
    - data: The JSON data; its ['content']['structure'] entry is inspected
      for a 'subparagraphs' list.

    Returns:
    - A string with the text of all nested subparagraphs, or an empty string
      when the structure has no 'subparagraphs' key.
    """
    structure = data['content']['structure']

    # Nothing to walk when the root node carries no subparagraphs.
    if 'subparagraphs' not in structure:
        return ""

    return await extract_text_from_subparagraphs(structure['subparagraphs'])
async def extract_text_from_subparagraphs(subparagraphs):
    """
    Recursively gather the text of a list of subparagraphs.

    Each node contributes its own 'text' followed by a newline, immediately
    followed by the text of its nested 'subparagraphs' (when that key is
    present), so the result is in depth-first order.

    Args:
    - subparagraphs: A list of subparagraph dicts.

    Returns:
    - A string containing the text from all subparagraphs.
    """
    pieces = []
    for node in subparagraphs:
        pieces.append(node['text'] + "\n")
        if 'subparagraphs' in node:
            nested = await extract_text_from_subparagraphs(node['subparagraphs'])
            pieces.append(nested)
    return "".join(pieces)
def generate_formatted_resume(resume, chat_llm):
    """
    Run the resume text through the formatting prompt and the chat model.

    Args:
    - resume: The resume text substituted for the prompt's "text" variable.
    - chat_llm: The chat model the prompt is piped into.

    Returns:
    - The `content` attribute of the result produced by the chain.
    """
    formatting_prompt = PromptTemplate(input_variables=["text"], template=template)
    pipeline = formatting_prompt | chat_llm
    return pipeline.invoke({"text": resume}).content
def generate_json_structured_resume(resume, chat_llm):
    """
    Ask the chat model for a JSON rendering of the resume and parse it.

    The prompt is built from `template_format_instructions`, with the
    parser's format instructions pre-filled as a partial variable.

    Args:
    - resume: The resume text substituted for the prompt's "text" variable.
    - chat_llm: The chat model placed between the prompt and the parser.

    Returns:
    - The output of the JSON parser applied to the model's response
      (presumably shaped like ResumeStructure — confirm against callers).
    """
    json_parser = JsonOutputParser(pydantic_object=ResumeStructure)
    format_instructions = json_parser.get_format_instructions()
    structured_prompt = PromptTemplate(
        template=template_format_instructions,
        input_variables=["text"],
        partial_variables={"format_instructions": format_instructions},
    )
    return (structured_prompt | chat_llm | json_parser).invoke({"text": resume})
def delete_files_in_directory(directory):
    """
    Deletes all files in the specified directory.

    Only regular files directly inside the directory are removed;
    subdirectories and their contents are left in place.

    Args:
        directory (str): The path to the directory containing files to be deleted.

    Returns:
        None
    """
    # Check if the directory exists
    if not os.path.exists(directory):
        print(f"Directory '{directory}' does not exist.")
        return

    # A path to a regular file would make os.listdir raise
    # NotADirectoryError, so report it and stop instead.
    if not os.path.isdir(directory):
        print(f"Path '{directory}' is not a directory.")
        return

    # os.listdir returns a snapshot, so deleting while looping is safe.
    for entry_name in os.listdir(directory):
        entry_path = os.path.join(directory, entry_name)
        if os.path.isfile(entry_path):
            os.remove(entry_path)
            print(f"Deleted file: {entry_path}")