orrinin committed on
Commit 4b6695b · verified · 1 Parent(s): 09813a1

Update app.py

Files changed (1)
  1. app.py +21 -17
app.py CHANGED
@@ -1,11 +1,10 @@
  import os
- from bs4 import BeautifulSoup
+ import httpx
  from llama_index.core import Document
  from llama_index.core import Settings
  from llama_index.core import SimpleDirectoryReader
  from llama_index.core import StorageContext
  from llama_index.core import VectorStoreIndex
- from llama_index.readers.web import SimpleWebPageReader

  from llama_index.vector_stores.chroma import ChromaVectorStore

@@ -48,22 +47,27 @@ Settings.num_output = 512

  db_path=""

+ def validate_url(url):
+     try:
+         response = httpx.get(url, timeout=60.0)
+         response.raise_for_status()
+         text = [Document(text=response.text)]
+         option = "web"
+         return text, option
+     except httpx.RequestError as e:
+         raise gr.Error(f"An error occurred while requesting {url}: {str(e)}")
+     except httpx.HTTPStatusError as e:
+         raise gr.Error(f"Error response {e.response.status_code} while requesting {url}")
+     except Exception as e:
+         raise gr.Error(f"An unexpected error occurred: {str(e)}")
+
  def extract_web(url):
-     web_documents = SimpleWebPageReader().load_data(
-         [url]
-     )
-     html_content = web_documents[0].text
-     # Parse the data.
-     soup = BeautifulSoup(html_content, 'html.parser')
-     p_tags = soup.findAll('p')
-     text_content = ""
-     for each in p_tags:
-         text_content += each.text + "\n"
-
-     # Convert back to Document format
-     documents = [Document(text=text_content)]
-     option = "web"
-     return documents, option
+     print("Entered Webpage Extraction")
+     prefix_url = "https://r.jina.ai/"
+     full_url = prefix_url + url
+     print(full_url)
+     print("Exited Webpage Extraction")
+     return validate_url(full_url)

  def extract_doc(path):
      documents = SimpleDirectoryReader(input_files=path).load_data()
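For reference, the new extraction path introduced by this commit boils down to: prefix the target URL with the Jina Reader endpoint (https://r.jina.ai/), fetch it with httpx, and wrap the returned plain text in a llama_index Document. Below is a minimal, self-contained sketch of that flow; the function name fetch_via_jina_reader is illustrative only, and app.py itself splits the logic across extract_web and validate_url, reporting failures through gradio's gr.Error (gradio is assumed to be imported elsewhere in app.py).

import httpx
from llama_index.core import Document

def fetch_via_jina_reader(url, timeout=60.0):
    # Prepend the Jina Reader proxy so the page comes back as readable plain text.
    full_url = "https://r.jina.ai/" + url
    response = httpx.get(full_url, timeout=timeout)
    response.raise_for_status()  # surface HTTP errors instead of indexing an error page
    # Wrap the text so downstream indexing can consume it like any other document.
    return [Document(text=response.text)], "web"

if __name__ == "__main__":
    docs, option = fetch_via_jina_reader("https://example.com")
    print(option, len(docs[0].text))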