File size: 3,428 Bytes
da7be98
 
 
 
 
 
 
 
 
 
 
 
ad8446f
 
 
da7be98
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import os
import shutil
import tempfile
from typing import List

from dedoc import DedocManager
from fastapi import UploadFile
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser

from ResumeStructure import ResumeStructure
from prompt_template import template_format_instructions, template

# Directory under which uploaded files are written before they are parsed.
# NOTE(review): nothing is created at import time -- the creation code below
# is disabled -- so confirm how this path is provisioned in deployment.
TEMP_DIR = "/code/temp_files"
# if not os.path.exists(TEMP_DIR):
#     os.makedirs(TEMP_DIR)


async def process_file_with_dedoc(file: UploadFile):
    """
    Process the file using Dedoc and return the output data.

    The upload is validated first, then written into a private scratch
    directory below TEMP_DIR, parsed, and the scratch directory is removed
    again whether or not parsing succeeds.

    Args:
    - file: The UploadedFile object to be processed.

    Returns:
    - Output data if the file is processed successfully, None when the
      upload has no usable file name or an unsupported extension.

    Raises:
    - Errors raised while saving or parsing the file propagate to the
      caller; the scratch directory is still cleaned up in that case.
    """
    supported_formats = {'jpg', 'jpeg', 'png', 'docx', 'pdf', 'html', 'doc'}

    print(f"Processing file '{file.filename}'...")

    # The file name is supplied by the client: keep only its final path
    # component so it can never point outside the scratch directory.
    raw_name = file.filename or ""
    safe_name = os.path.basename(raw_name.replace("\\", "/"))

    # Extract the extension without the leading dot, in lowercase
    file_extension = os.path.splitext(safe_name)[1][1:].lower()

    # Reject unsupported uploads before anything is written to disk, so a
    # rejected file leaves nothing behind.
    if file_extension not in supported_formats:
        print(f"Cannot process file '{file.filename}'. Unsupported file format.")
        return None

    manager = DedocManager()

    # One scratch directory per call: two uploads that share a name (for
    # example "resume.pdf") no longer overwrite or delete each other's file,
    # while the saved file keeps its original base name.
    os.makedirs(TEMP_DIR, exist_ok=True)
    work_dir = tempfile.mkdtemp(dir=TEMP_DIR)
    try:
        file_path = os.path.join(work_dir, safe_name)

        # Save the uploaded file to the scratch directory
        with open(file_path, "wb") as buffer:
            shutil.copyfileobj(file.file, buffer)

        # Process the file using Dedoc
        output = manager.parse(file_path)
        return output.to_api_schema().model_dump()
    finally:
        # Remove the temporary data even when saving or parsing failed
        shutil.rmtree(work_dir, ignore_errors=True)


async def extract_text_from_all_levels(data):
    """
    Collect the text of every subparagraph found in a parsed document.

    Args:
    - data: The JSON data; its data['content']['structure'] entry is
      inspected for a 'subparagraphs' list.

    Returns:
    - The text gathered from all levels of subparagraphs, or an empty
      string when the structure has no 'subparagraphs' entry.
    """
    structure = data['content']['structure']

    # Nothing to walk: the document root carries no subparagraphs
    if 'subparagraphs' not in structure:
        return ""

    return await extract_text_from_subparagraphs(structure['subparagraphs'])


async def extract_text_from_subparagraphs(subparagraphs):
    """
    Extract text from subparagraphs and everything nested below them.

    The tree is walked depth-first in document order (a node's text comes
    before the text of its own subparagraphs). The walk uses an explicit
    stack instead of one awaited coroutine per nesting level, and the result
    is assembled with a single join rather than repeated string
    concatenation.

    Args:
    - subparagraphs: A list of subparagraphs; each item provides 'text' and
      may provide a nested 'subparagraphs' list.

    Returns:
    - A string containing the text from all subparagraphs, each entry
      followed by a newline.
    """
    pieces = []
    # Stack of iterators: the top one is the level currently being read
    pending = [iter(subparagraphs)]

    while pending:
        try:
            subpara = next(pending[-1])
        except StopIteration:
            # This level is exhausted; resume the parent level
            pending.pop()
            continue

        pieces.append(subpara['text'] + "\n")
        if 'subparagraphs' in subpara:
            # Descend into the children before continuing with siblings
            pending.append(iter(subpara['subparagraphs']))

    return "".join(pieces)


def generate_formatted_resume(resume, chat_llm):
    """
    Run the resume text through the formatting prompt.

    Args:
    - resume: The text substituted for the prompt's "text" variable.
    - chat_llm: The chat model the prompt is piped into.

    Returns:
    - The `content` attribute of the model's response.
    """
    formatting_prompt = PromptTemplate(
        input_variables=["text"],
        template=template,
    )

    # Pipe the prompt into the model and run it once on the resume
    response = (formatting_prompt | chat_llm).invoke({"text": resume})
    return response.content


def generate_json_structured_resume(resume, chat_llm):
    """
    Run the resume text through the JSON-structuring prompt.

    Args:
    - resume: The text substituted for the prompt's "text" variable.
    - chat_llm: The chat model the prompt is piped into.

    Returns:
    - The output of the JSON parser (built for ResumeStructure) applied to
      the model's response.
    """
    json_parser = JsonOutputParser(pydantic_object=ResumeStructure)
    format_instructions = json_parser.get_format_instructions()

    structuring_prompt = PromptTemplate(
        input_variables=["text"],
        partial_variables={"format_instructions": format_instructions},
        template=template_format_instructions,
    )

    # prompt -> model -> parser, executed once on the resume
    pipeline = structuring_prompt | chat_llm | json_parser
    return pipeline.invoke({"text": resume})