# Pasted from GitHub UI — author: umairahmad89, commit: "initial commit" (67a91b0)
import hmac
import mimetypes
import os
import time
from typing import List, Dict, Union

import requests

from langchain_core.documents.base import Document
def parse_file(api_key, file_path, timeout: float = 300.0, poll_interval: float = 1.0) -> str:
    """Upload a file to the LlamaParse API and return the parsed text.

    Args:
        api_key: LlamaCloud API key, sent as a Bearer token.
        file_path: Path to the local file to upload.
        timeout: Maximum seconds to wait for the parse job to complete.
        poll_interval: Seconds to sleep between result polls.

    Returns:
        The parsed document content as plain text.

    Raises:
        requests.HTTPError: If the upload request fails, or if the job is
            still not ready when the deadline expires and the last poll
            returned an error status.
        TimeoutError: If the deadline expires without an HTTP error to report.
    """
    headers = {"Authorization": f"Bearer {api_key}"}
    base_url = "https://api.cloud.llamaindex.ai/api/parsing"

    mime_type = mimetypes.guess_type(file_path)[0]
    with open(file_path, "rb") as f:
        files = {"file": (f.name, f, mime_type)}
        # Upload the file; the response body carries the parse job id.
        response = requests.post(
            f"{base_url}/upload", headers=headers, files=files, timeout=60
        )
    response.raise_for_status()
    job_id = response.json()["id"]

    result_type = "text"  # or "markdown"
    result_url = f"{base_url}/job/{job_id}/result/{result_type}"

    # Poll until the result is ready. The original looped forever on any
    # non-200 status (including 401/404/500), so a bad key or dead job hung
    # the caller indefinitely; enforce a deadline and surface the last error.
    deadline = time.monotonic() + timeout
    while True:
        response = requests.get(result_url, headers=headers, timeout=60)
        if response.status_code == 200:
            break
        if time.monotonic() >= deadline:
            # Prefer the HTTP error if there is one; otherwise report timeout.
            response.raise_for_status()
            raise TimeoutError(
                f"Parsing job {job_id} did not finish within {timeout}s"
            )
        time.sleep(poll_interval)

    result = response.json()
    return result[result_type]
def get_paged_text(text: str, separator: str = "\n---\n") -> List[str]:
    """Split parsed text into per-page chunks on *separator*.

    Empty chunks (e.g. from a leading/trailing separator) are dropped.
    """
    return [page for page in text.split(separator) if page]
def auth_user(username, password):
    """Check credentials against the USERNAME/PASSWORD environment variables.

    Uses constant-time comparison (hmac.compare_digest) instead of ``==`` so
    response timing does not leak how much of a guess matched. Missing
    environment variables are treated as a failed login rather than raising
    KeyError.

    Returns:
        True if both username and password match, False otherwise.
    """
    expected_user = os.environ.get("USERNAME")
    expected_pass = os.environ.get("PASSWORD")
    if expected_user is None or expected_pass is None:
        return False
    # Bitwise & (not `and`) so both comparisons always execute — keeps the
    # timing profile independent of which credential is wrong.
    user_ok = hmac.compare_digest(expected_user, username)
    pass_ok = hmac.compare_digest(expected_pass, password)
    return bool(user_ok & pass_ok)
def convert_to_docs(
    docs_str: List[str], metadata: Dict[str, Union[str, int, float]]
) -> List[Document]:
    """Wrap each text chunk in a langchain Document.

    Each Document receives its own copy of *metadata* so later per-document
    mutations do not bleed into the others.
    """
    return [
        Document(page_content=chunk, metadata=dict(metadata)) for chunk in docs_str
    ]