orrinin committed on
Commit 4b6695b · verified · 1 Parent(s): 09813a1

Update app.py

Files changed (1)
  1. app.py +21 -17
app.py CHANGED
@@ -1,11 +1,10 @@
  import os
- from bs4 import BeautifulSoup
+ import httpx
  from llama_index.core import Document
  from llama_index.core import Settings
  from llama_index.core import SimpleDirectoryReader
  from llama_index.core import StorageContext
  from llama_index.core import VectorStoreIndex
- from llama_index.readers.web import SimpleWebPageReader

  from llama_index.vector_stores.chroma import ChromaVectorStore

@@ -48,22 +47,27 @@ Settings.num_output = 512

  db_path=""

+ def validate_url(url):
+     try:
+         response = httpx.get(url, timeout=60.0)
+         response.raise_for_status()
+         text = [Document(text=response.text)]
+         option = "web"
+         return text, option
+     except httpx.RequestError as e:
+         raise gr.Error(f"An error occurred while requesting {url}: {str(e)}")
+     except httpx.HTTPStatusError as e:
+         raise gr.Error(f"Error response {e.response.status_code} while requesting {url}")
+     except Exception as e:
+         raise gr.Error(f"An unexpected error occurred: {str(e)}")
+
  def extract_web(url):
-     web_documents = SimpleWebPageReader().load_data(
-         [url]
-     )
-     html_content = web_documents[0].text
-     # Parse the data.
-     soup = BeautifulSoup(html_content, 'html.parser')
-     p_tags = soup.findAll('p')
-     text_content = ""
-     for each in p_tags:
-         text_content += each.text + "\n"
-
-     # Convert back to Document format
-     documents = [Document(text=text_content)]
-     option = "web"
-     return documents, option
+     print("Entered Webpage Extraction")
+     prefix_url = "https://r.jina.ai/"
+     full_url = prefix_url + url
+     print(full_url)
+     print("Exited Webpage Extraction")
+     return validate_url(full_url)

  def extract_doc(path):
      documents = SimpleDirectoryReader(input_files=path).load_data()
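For reference, the new extraction path introduced by this commit boils down to: prefix the target URL with the Jina Reader endpoint (https://r.jina.ai/), fetch it with httpx, and wrap the returned plain text in a llama_index Document. Below is a minimal, self-contained sketch of that flow; the function name fetch_via_jina_reader is illustrative only, and app.py itself splits the logic across extract_web and validate_url, reporting failures through gradio's gr.Error (gradio is assumed to be imported elsewhere in app.py).

import httpx
from llama_index.core import Document

def fetch_via_jina_reader(url, timeout=60.0):
    # Prepend the Jina Reader proxy so the page comes back as readable plain text.
    full_url = "https://r.jina.ai/" + url
    response = httpx.get(full_url, timeout=timeout)
    response.raise_for_status()  # surface HTTP errors instead of indexing an error page
    # Wrap the text so downstream indexing can consume it like any other document.
    return [Document(text=response.text)], "web"

if __name__ == "__main__":
    docs, option = fetch_via_jina_reader("https://example.com")
    print(option, len(docs[0].text))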