Update app.py
Browse files
app.py
CHANGED
@@ -1,11 +1,10 @@
|
|
1 |
import os
|
2 |
-
|
3 |
from llama_index.core import Document
|
4 |
from llama_index.core import Settings
|
5 |
from llama_index.core import SimpleDirectoryReader
|
6 |
from llama_index.core import StorageContext
|
7 |
from llama_index.core import VectorStoreIndex
|
8 |
-
from llama_index.readers.web import SimpleWebPageReader
|
9 |
|
10 |
from llama_index.vector_stores.chroma import ChromaVectorStore
|
11 |
|
@@ -48,22 +47,27 @@ Settings.num_output = 512
|
|
48 |
|
49 |
db_path=""
|
50 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
51 |
def extract_web(url):
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
p_tags = soup.findAll('p')
|
59 |
-
text_content = ""
|
60 |
-
for each in p_tags:
|
61 |
-
text_content += each.text + "\n"
|
62 |
-
|
63 |
-
# Convert back to Document format
|
64 |
-
documents = [Document(text=text_content)]
|
65 |
-
option = "web"
|
66 |
-
return documents, option
|
67 |
|
68 |
def extract_doc(path):
|
69 |
documents = SimpleDirectoryReader(input_files=path).load_data()
|
|
|
1 |
import os
|
2 |
+
import httpx
|
3 |
from llama_index.core import Document
|
4 |
from llama_index.core import Settings
|
5 |
from llama_index.core import SimpleDirectoryReader
|
6 |
from llama_index.core import StorageContext
|
7 |
from llama_index.core import VectorStoreIndex
|
|
|
8 |
|
9 |
from llama_index.vector_stores.chroma import ChromaVectorStore
|
10 |
|
|
|
47 |
|
48 |
db_path=""
|
49 |
|
50 |
+
def validate_url(url):
    """Fetch ``url`` with an HTTP GET and wrap the response body in a Document.

    Args:
        url: Address to request.

    Returns:
        A ``(documents, option)`` tuple. ``documents`` is a one-element list
        containing a ``Document`` built from the response text; ``option`` is
        the literal string ``"web"``.

    Raises:
        gr.Error: If the request itself fails, the server replies with an
            error status, or any other exception occurs while building the
            result. The triggering exception is attached as ``__cause__``.
    """
    # NOTE(review): `gr` is not imported in the visible part of this file;
    # presumably `import gradio as gr` exists elsewhere -- confirm.
    try:
        response = httpx.get(url, timeout=60.0)
        # Convert an error status code into httpx.HTTPStatusError so it is
        # reported by the dedicated handler below.
        response.raise_for_status()
        documents = [Document(text=response.text)]
        option = "web"
        return documents, option
    except httpx.RequestError as e:
        # Failure while issuing the request; no usable response was received.
        raise gr.Error(f"An error occurred while requesting {url}: {e}") from e
    except httpx.HTTPStatusError as e:
        # A response arrived but raise_for_status() rejected its status code.
        raise gr.Error(
            f"Error response {e.response.status_code} while requesting {url}"
        ) from e
    except Exception as e:
        # Last-resort boundary so every failure reaches the caller as gr.Error.
        raise gr.Error(f"An unexpected error occurred: {e}") from e
|
63 |
+
|
64 |
def extract_web(url):
    """Build the proxied address for ``url`` and fetch it via ``validate_url``.

    The target is formed by prepending ``https://r.jina.ai/`` to ``url``
    (presumably the Jina reader endpoint, which would return a text rendering
    of the page -- confirm). Progress messages and the final address are
    printed to stdout.

    Args:
        url: Address of the web page to extract.

    Returns:
        Whatever ``validate_url`` returns for the proxied address.
    """
    reader_prefix = "https://r.jina.ai/"
    print("Entered Webpage Extraction")
    target = reader_prefix + url
    print(target)
    # This message is printed before the request is made; the fetch itself
    # happens in validate_url() on the next line.
    print("Exited Webpage Extraction")
    return validate_url(target)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
71 |
|
72 |
def extract_doc(path):
|
73 |
documents = SimpleDirectoryReader(input_files=path).load_data()
|