Spaces:

astro21
/

resume-revealer

Sleeping

App Files Files Community

resume-revealer / utils.py

astro21

Update utils.py

ad8446f verified 5 months ago

raw

history blame

3.43 kB

	import os
	import shutil
	from dedoc import DedocManager
	from langchain.chat_models import ChatOpenAI
	from langchain.prompts import PromptTemplate
	from langchain_core.output_parsers import JsonOutputParser
	from ResumeStructure import ResumeStructure
	from fastapi import UploadFile
	from prompt_template import template_format_instructions, template
	from typing import List

	# Directory that uploaded files are written into while they are being parsed.
	TEMP_DIR = "/code/temp_files"
	# Import-time creation of the directory is disabled (kept below for reference).
	# NOTE(review): confirm where this directory is expected to be created.
	# if not os.path.exists(TEMP_DIR):
	# os.makedirs(TEMP_DIR)


	async def process_file_with_dedoc(file: UploadFile):
	    """
	    Process the file using Dedoc and return the output data.

	    The upload is validated first, then copied into TEMP_DIR, parsed, and the
	    temporary copy is removed again whether or not parsing succeeds.

	    Args:
	    - file: The UploadedFile object to be processed.

	    Returns:
	    - Output data (the dumped Dedoc API schema) if the file is processed
	      successfully, None if the file extension is not supported.

	    Raises:
	    - Any exception raised while writing or parsing the file propagates to the
	      caller; the temporary copy is still cleaned up in that case.
	    """
	    supported_formats = ('jpg', 'jpeg', 'png', 'docx', 'pdf', 'html', 'doc')

	    print(f"Processing file '{file.filename}'...")

	    # The file name is supplied by the client: keep only its last path
	    # component so it cannot point outside TEMP_DIR ("../", absolute paths,
	    # Windows-style separators).
	    safe_name = os.path.basename((file.filename or "").replace("\\", "/"))

	    # Extension without the leading dot, lower-cased for the comparison
	    file_extension = os.path.splitext(safe_name)[1][1:].lower()

	    # Reject unsupported files before anything is written to disk, so no
	    # orphaned copy is left behind for them.
	    if file_extension not in supported_formats:
	        print(f"Cannot process file '{file.filename}'. Unsupported file format.")
	        return None

	    # Make sure the target directory exists before writing into it
	    os.makedirs(TEMP_DIR, exist_ok=True)
	    file_path = os.path.join(TEMP_DIR, safe_name)

	    try:
	        # Save the uploaded file to the temporary directory
	        with open(file_path, "wb") as buffer:
	            shutil.copyfileobj(file.file, buffer)

	        # Process the file using Dedoc
	        manager = DedocManager()
	        output = manager.parse(file_path)
	        return output.to_api_schema().model_dump()
	    finally:
	        # Remove the temporary file even when writing or parsing failed; it may
	        # not exist if opening it for writing was what failed.
	        if os.path.exists(file_path):
	            os.remove(file_path)


	async def extract_text_from_all_levels(data):
	    """
	    Collect the text of every subparagraph, at any depth, of a parsed document.

	    Args:
	    - data: The JSON data; its ['content']['structure'] entry is inspected
	      for a 'subparagraphs' list.

	    Returns:
	    - A string with the text of all subparagraphs, or an empty string when
	      the structure has no 'subparagraphs' key.
	    """
	    structure = data['content']['structure']

	    # Nothing to walk: the document root carries no subparagraphs
	    if 'subparagraphs' not in structure:
	        return ""

	    return await extract_text_from_subparagraphs(structure['subparagraphs'])


	def _collect_subparagraph_text(subparagraphs, parts):
	    """Append the text of every node in subparagraphs, depth-first, to parts."""
	    # A missing or None list of children is treated as "no children"
	    for subpara in subparagraphs or ():
	        parts.append(subpara['text'] + "\n")
	        _collect_subparagraph_text(subpara.get('subparagraphs'), parts)


	async def extract_text_from_subparagraphs(subparagraphs):
	    """
	    Recursively extract text from subparagraphs.

	    Each node contributes its 'text' followed by a newline, and a node's own
	    text comes before the text of its nested subparagraphs.

	    Args:
	    - subparagraphs: A list of subparagraphs (dicts with a 'text' key and an
	      optional 'subparagraphs' list).

	    Returns:
	    - A string containing the text from all subparagraphs.
	    """
	    # Gather the pieces in one list and join once, instead of growing a
	    # string with += at every recursion level.
	    parts = []
	    _collect_subparagraph_text(subparagraphs, parts)
	    return "".join(parts)


	def generate_formatted_resume(resume, chat_llm):
	    """
	    Run the resume text through the formatting prompt and return the reply.

	    Args:
	    - resume: Text substituted for the "text" variable of the prompt template.
	    - chat_llm: Chat model the prompt is piped into.

	    Returns:
	    - The `content` attribute of the result produced by invoking the chain.
	    """
	    prompt = PromptTemplate(
	        template=template,
	        input_variables=["text"],
	    )
	    # Compose prompt and model into a chain with the `|` operator
	    chain = prompt | chat_llm

	    result = chain.invoke({"text": resume})

	    return result.content


	def generate_json_structured_resume(resume, chat_llm):
	    """
	    Run the resume text through the JSON prompt and return the parsed result.

	    Args:
	    - resume: Text substituted for the "text" variable of the prompt template.
	    - chat_llm: Chat model the prompt is piped into.

	    Returns:
	    - The output of the chain's JsonOutputParser stage, which is configured
	      with ResumeStructure as its pydantic object.
	    """
	    parser = JsonOutputParser(pydantic_object=ResumeStructure)

	    prompt = PromptTemplate(
	        template=template_format_instructions,
	        input_variables=["text"],
	        # The parser's format instructions are filled in up front, so callers
	        # only have to supply "text".
	        partial_variables={"format_instructions": parser.get_format_instructions()},
	    )
	    # Compose prompt, model and parser into a chain with the `|` operator
	    chain = prompt | chat_llm | parser

	    result = chain.invoke({"text": resume})

	    return result