File size: 1,777 Bytes
67a91b0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import os
import requests
from typing import List, Dict, Union
import time
import mimetypes
from langchain_core.documents.base import Document


def parse_file(
    api_key,
    file_path,
    result_type: str = "text",
    poll_interval: float = 1.0,
    max_wait: Union[float, None] = 600.0,
    request_timeout: float = 60.0,
) -> str:
    """Upload a file to the LlamaIndex cloud parsing API and return the parsed output.

    The file is posted to the ``/upload`` endpoint, then the job's result
    endpoint is polled until it answers with HTTP 200.

    Args:
        api_key: Bearer token sent in the ``Authorization`` header.
        file_path: Path of the file to upload; it is opened in binary mode.
        result_type: Result flavour to request, ``"text"`` (default) or
            ``"markdown"``. It is used both in the result URL and as the key
            read from the JSON response.
        poll_interval: Seconds to sleep between polling attempts.
        max_wait: Maximum number of seconds to keep polling before giving up.
            Pass ``None`` to poll without an overall limit.
        request_timeout: Timeout in seconds applied to every HTTP request so a
            stalled connection cannot block forever.

    Returns:
        The value stored under ``result_type`` in the result JSON.

    Raises:
        requests.HTTPError: If the upload fails, or if polling is rejected
            with 401/403 (waiting cannot fix an authorization failure).
        TimeoutError: If the result is not ready within ``max_wait`` seconds.
    """
    headers = {"Authorization": f"Bearer {api_key}"}
    base_url = "https://api.cloud.llamaindex.ai/api/parsing"

    with open(file_path, "rb") as f:
        # guess_type may return None; requests then omits the content type.
        mime_type = mimetypes.guess_type(file_path)[0]
        files = {"file": (f.name, f, mime_type)}

        # Upload while the file handle is still open.
        response = requests.post(
            f"{base_url}/upload",
            headers=headers,
            files=files,
            timeout=request_timeout,
        )

    response.raise_for_status()
    # The job id identifies the result endpoint to poll.
    job_id = response.json()["id"]
    result_url = f"{base_url}/job/{job_id}/result/{result_type}"

    # A monotonic clock keeps the deadline immune to wall-clock adjustments.
    deadline = None if max_wait is None else time.monotonic() + max_wait

    # Poll until the result is ready, the request is rejected, or we time out.
    while True:
        response = requests.get(result_url, headers=headers, timeout=request_timeout)
        if response.status_code == 200:
            break

        if response.status_code in (401, 403):
            # Authorization errors never turn into a 200 by waiting.
            response.raise_for_status()

        if deadline is not None and time.monotonic() >= deadline:
            raise TimeoutError(
                f"Parsing job {job_id} was not ready after {max_wait} seconds "
                f"(last HTTP status: {response.status_code})"
            )

        time.sleep(poll_interval)

    # The 200 response body carries the parsed content keyed by result type.
    return response.json()[result_type]


def get_paged_text(text: str, separator: str = "\n---\n") -> List[str]:
    """Break ``text`` into pages at every ``separator``, dropping empty pages."""
    return [page for page in text.split(separator) if page]


def auth_user(username, password):
    """Check credentials against the ``USERNAME`` and ``PASSWORD`` environment variables.

    Args:
        username: User name supplied by the caller.
        password: Password supplied by the caller.

    Returns:
        ``True`` only when both values match the configured environment
        variables. ``False`` when either one differs, when either variable is
        unset, or when a supplied value is not a string.
    """
    # Local import keeps this function self-contained.
    import hmac

    expected_username = os.environ.get("USERNAME")
    expected_password = os.environ.get("PASSWORD")

    # Fail closed: missing configuration must reject logins, not raise KeyError.
    if expected_username is None or expected_password is None:
        return False
    if not isinstance(username, str) or not isinstance(password, str):
        return False

    def _as_bytes(value):
        # compare_digest only accepts ASCII str, so compare encoded bytes;
        # surrogatepass keeps odd environment values from raising.
        return value.encode("utf-8", "surrogatepass")

    # Evaluate both comparisons (no short-circuit) in constant time so the
    # response time does not reveal which credential was wrong.
    username_ok = hmac.compare_digest(_as_bytes(username), _as_bytes(expected_username))
    password_ok = hmac.compare_digest(_as_bytes(password), _as_bytes(expected_password))
    return username_ok and password_ok


def convert_to_docs(
    docs_str: List[str], metadata: Dict[str, Union[str, int, float]]
) -> List[Document]:
    """Wrap each string in a ``Document``, giving every one its own copy of ``metadata``."""
    # Copy per document so the resulting Documents do not share one dict.
    return [
        Document(page_content=content, metadata=metadata.copy())
        for content in docs_str
    ]