Spaces:

rchrdgwr
/

AI4Midterm

Sleeping

App Files Files Community

AI4Midterm / utilities /doc_utilities.py

rchrdgwr

Updated application

c6907ac 4 months ago

raw

history blame contribute delete

2.09 kB

	from langchain_community.document_loaders import PyMuPDFLoader
	import fitz
	import os
	import requests

	from utilities.debugger import dprint
	import uuid



	def download_document(app_state, url, file_name, download_folder, timeout=30):
	    """Download ``url`` into ``download_folder`` unless the file already exists.

	    Args:
	        app_state: Application state object, forwarded to ``dprint``.
	        url: Address the document is fetched from.
	        file_name: Name of the target file inside ``download_folder``.
	        download_folder: Directory for the file; created when missing.
	        timeout: Seconds handed to ``requests.get`` so that a stalled
	            server cannot block the caller indefinitely.

	    Returns:
	        ``os.path.join(download_folder, file_name)``. The path is returned
	        even when the server answered with a non-200 status; in that case
	        nothing is written and the failure is only reported via ``dprint``.

	    Raises:
	        Exceptions raised by ``requests.get`` (for example on timeout or
	        connection failure) and by the file system calls propagate to the
	        caller unchanged.
	    """
	    file_path = os.path.join(download_folder, file_name)
	    # exist_ok=True removes the check-then-create race of a separate
	    # os.path.exists() test followed by os.makedirs().
	    os.makedirs(download_folder, exist_ok=True)

	    if os.path.exists(file_path):
	        dprint(app_state, f"{file_name} already exists locally.")
	        return file_path

	    print(f"Downloading {file_name} from {url}...")
	    response = requests.get(url, timeout=timeout)
	    if response.status_code != 200:
	        dprint(app_state, f"Failed to download document from {url}. Status code: {response.status_code}")
	        return file_path

	    # Write to a sibling temporary file and move it into place afterwards,
	    # so an interrupted write never leaves a truncated file that later
	    # calls would mistake for a complete local copy.
	    partial_path = file_path + ".part"
	    with open(partial_path, 'wb') as f:
	        f.write(response.content)
	    os.replace(partial_path, file_path)
	    return file_path

	def get_documents(app_state):
	    """Download, load and register every document listed on ``app_state``.

	    For each URL in ``app_state.document_urls`` the file is fetched through
	    ``download_document`` into ``app_state.download_folder``, loaded with
	    ``PyMuPDFLoader``, and handed to ``app_state.add_document`` as a dict
	    with the keys ``url``, ``title``, ``metadata``, ``loaded_document``,
	    ``single_text_document`` and ``document_id`` (a fresh UUID4 string).

	    Args:
	        app_state: Application state object providing ``document_urls``,
	            ``download_folder``, ``add_document`` and ``documents``.

	    Returns:
	        None. Results are stored through ``app_state.add_document``.
	    """
	    for url in app_state.document_urls:
	        dprint(app_state, f"Downloading and loading document from {url}...")
	        file_name = url.split("/")[-1]
	        file_path = download_document(app_state, url, file_name, app_state.download_folder)
	        loaded_document = PyMuPDFLoader(file_path).load()
	        single_text_document = "\n".join(doc.page_content for doc in loaded_document)
	        dprint(app_state, f"Number of pages: {len(loaded_document)}")

	        # Read the metadata and release the file handle right away; the
	        # try/finally guarantees the handle is closed even if reading fails.
	        pdf = fitz.open(file_path)
	        try:
	            # metadata may be None (e.g. for a locked document), so fall
	            # back to an empty dict; the copy stays valid after close().
	            metadata = dict(pdf.metadata or {})
	        finally:
	            pdf.close()
	        # The 'title' key can be present but empty or None, which a plain
	        # .get(key, default) would return as-is; `or` applies the fallback.
	        title = metadata.get('title') or 'Document 1'

	        document = {
	            "url": url,
	            "title": title,
	            "metadata": metadata,
	            "loaded_document": loaded_document,
	            "single_text_document": single_text_document,
	            "document_id": str(uuid.uuid4()),
	        }
	        app_state.add_document(document)
	        dprint(app_state, f"Title of Document: {title}")
	        dprint(app_state, f"Full metadata for Document 1: {metadata}")
	    # Summary printed once, after all URLs have been processed.
	    print(f"Total documents: {len(app_state.documents)}")