|
import mimetypes
import os
import secrets
import time
from typing import Dict, List, Union

import requests
from langchain_core.documents.base import Document


def parse_file(api_key: str, file_path: str) -> str:
    """Upload a file to the LlamaParse API and poll until its parsed text is ready."""
    headers = {"Authorization": f"Bearer {api_key}"}
    base_url = "https://api.cloud.llamaindex.ai/api/parsing"

    # Upload while the file handle is still open. guess_type may return None
    # for unknown extensions, which requests treats as an unspecified type.
    with open(file_path, "rb") as f:
        mime_type = mimetypes.guess_type(file_path)[0]
        files = {"file": (f.name, f, mime_type)}
        response = requests.post(f"{base_url}/upload", headers=headers, files=files)

    response.raise_for_status()
    job_id = response.json()["id"]

    result_type = "text"
    result_url = f"{base_url}/job/{job_id}/result/{result_type}"

    # Poll until the result endpoint returns 200, which signals the job is
    # done. Cap the wait so a failed or stuck job cannot spin forever.
    deadline = time.time() + 300
    while True:
        response = requests.get(result_url, headers=headers)
        if response.status_code == 200:
            break
        if time.time() > deadline:
            raise TimeoutError(f"Parsing job {job_id} did not complete within 5 minutes")
        time.sleep(1)

    return response.json()[result_type]


def get_paged_text(text: str, separator: str = "\n---\n") -> List[str]:
    """Split parsed document text into per-page chunks on the separator,
    dropping empty chunks."""
    return [page for page in text.split(separator) if page]


def auth_user(username: str, password: str) -> bool:
    """Constant-time credential check against the USERNAME/PASSWORD env vars."""
    # compare_digest avoids leaking match information through comparison timing.
    user_ok = secrets.compare_digest(username, os.environ["USERNAME"])
    pass_ok = secrets.compare_digest(password, os.environ["PASSWORD"])
    return user_ok and pass_ok


def convert_to_docs(
    docs_str: List[str], metadata: Dict[str, Union[str, int, float]]
) -> List[Document]:
    """Wrap each page string in a LangChain Document with its own metadata copy."""
    # Copy the metadata per Document so later mutation of one page's metadata
    # does not leak into the others.
    return [Document(page_content=doc, metadata=metadata.copy()) for doc in docs_str]
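

# --- Usage sketch (illustrative, not part of the original module) ---
# Wires the helpers together end to end. The LLAMA_CLOUD_API_KEY variable and
# the "sample.pdf" path are assumed placeholders for this example.
if __name__ == "__main__":
    api_key = os.environ["LLAMA_CLOUD_API_KEY"]  # assumed env var name
    raw_text = parse_file(api_key, "sample.pdf")  # hypothetical input file
    pages = get_paged_text(raw_text)
    docs = convert_to_docs(pages, {"source": "sample.pdf"})
    print(f"Parsed {len(docs)} page(s)")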