import hmac
import mimetypes
import os
import time
from typing import Dict, List, Union

import requests
from langchain_core.documents.base import Document


def parse_file(api_key: str, file_path: str) -> str:
    """Parse a file with the LlamaParse API and return the extracted text."""
    headers = {"Authorization": f"Bearer {api_key}"}
    base_url = "https://api.cloud.llamaindex.ai/api/parsing"

    # Upload the file to start a parsing job.
    mime_type = mimetypes.guess_type(file_path)[0]
    with open(file_path, "rb") as f:
        files = {"file": (f.name, f, mime_type)}
        response = requests.post(f"{base_url}/upload", headers=headers, files=files)
    response.raise_for_status()

    # Use the returned job id to build the result URL.
    job_id = response.json()["id"]
    result_type = "text"  # or "markdown"
    result_url = f"{base_url}/job/{job_id}/result/{result_type}"

    # Poll for the result until it's ready; the endpoint returns a
    # non-200 status while the job is still running.
    while True:
        response = requests.get(result_url, headers=headers)
        if response.status_code == 200:
            break
        time.sleep(1)

    # Download the result and return the parsed text.
    result = response.json()
    return result[result_type]


def get_paged_text(text: str, separator: str = "\n---\n") -> List[str]:
    """Split the parsed text into pages on the page separator."""
    pages = []
    for doc_chunk in text.split(separator):
        if doc_chunk:
            pages.append(doc_chunk)
    return pages


def auth_user(username: str, password: str) -> bool:
    """Check credentials against the USERNAME/PASSWORD environment variables.

    hmac.compare_digest performs a constant-time comparison, avoiding the
    timing side channel that a plain == comparison would leak.
    """
    return hmac.compare_digest(
        os.environ["USERNAME"], username
    ) and hmac.compare_digest(os.environ["PASSWORD"], password)


def convert_to_docs(
    docs_str: List[str], metadata: Dict[str, Union[str, int, float]]
) -> List[Document]:
    """Wrap each page string in a LangChain Document with shared metadata."""
    docs = []
    for doc in docs_str:
        docs.append(Document(page_content=doc, metadata=metadata.copy()))
    return docs
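

# A minimal usage sketch tying the helpers together, assuming a
# LLAMA_CLOUD_API_KEY environment variable and a local "report.pdf"
# (both are hypothetical placeholders, not part of this module):
#
#     api_key = os.environ["LLAMA_CLOUD_API_KEY"]
#     text = parse_file(api_key, "report.pdf")
#     pages = get_paged_text(text)
#     docs = convert_to_docs(pages, {"source": "report.pdf"})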