michaelwechner
committed on
Commit
·
6feb084
1
Parent(s):
9a0d22f
use UnstructuredHTMLLoader
Browse files- kg_builder/src/graph_creation.py +14 -10
kg_builder/src/graph_creation.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
import logging
|
2 |
|
3 |
-
from langchain_community.document_loaders import WikipediaLoader
|
4 |
from langchain.text_splitter import TokenTextSplitter
|
5 |
from knowledge_graph_builder import extract_and_store_graph
|
6 |
from dotenv import load_dotenv
|
@@ -15,18 +15,21 @@ load_dotenv()
|
|
15 |
|
16 |
# IMPORTANT: Make sure data source names match with values inside api_connections.py
|
17 |
# Define articles / topics to load
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
# Switzerland: https://www.fedlex.admin.ch/eli/cc/1962/1364_1409_1420/de
|
23 |
# Connecticut: https://en.wikipedia.org/wiki/Transportation_in_Connecticut#Rules_of_the_road
|
24 |
articles = {
|
25 |
"Traffic Law": "Traffic laws in the United States"
|
26 |
}
|
27 |
-
|
28 |
-
|
29 |
-
|
|
|
|
|
|
|
30 |
|
31 |
def build_graph_for_article(query, data_source_name):
|
32 |
"""
|
@@ -43,7 +46,8 @@ def build_graph_for_article(query, data_source_name):
|
|
43 |
|
44 |
if data_source_name == "SquirroDocs":
|
45 |
logger.info(f"Loading document(s) from public website {query} ...")
|
46 |
-
|
|
|
47 |
else:
|
48 |
logger.info(f"Loading document(s) from Wikipedia using query '{query}' ...")
|
49 |
raw_documents = WikipediaLoader(query=query, load_max_docs=load_max_documents).load()
|
@@ -52,7 +56,7 @@ def build_graph_for_article(query, data_source_name):
|
|
52 |
logger.error(f"Failed to load content for Data Source '{data_source_name}'!")
|
53 |
return
|
54 |
|
55 |
-
logger.info(f"{str(len(raw_documents))} document(s) loaded")
|
56 |
for doc in raw_documents:
|
57 |
logger.info(f"Document: {doc.metadata['source']}")
|
58 |
#print(f"Document: {doc.page_content}")
|
|
|
1 |
import logging
|
2 |
|
3 |
+
from langchain_community.document_loaders import WikipediaLoader, UnstructuredHTMLLoader
|
4 |
from langchain.text_splitter import TokenTextSplitter
|
5 |
from knowledge_graph_builder import extract_and_store_graph
|
6 |
from dotenv import load_dotenv
|
|
|
15 |
|
16 |
# IMPORTANT: Make sure data source names match with values inside api_connections.py
|
17 |
# Define articles / topics to load
|
18 |
+
articlesDISABLED = {
|
19 |
+
"Chemotherapy": "Chemotherapy",
|
20 |
+
"Traffic Law": "Traffic laws in the United States"
|
21 |
+
}
|
22 |
# Switzerland: https://www.fedlex.admin.ch/eli/cc/1962/1364_1409_1420/de
|
23 |
# Connecticut: https://en.wikipedia.org/wiki/Transportation_in_Connecticut#Rules_of_the_road
|
24 |
articles = {
|
25 |
"Traffic Law": "Traffic laws in the United States"
|
26 |
}
|
27 |
+
articlesDISABLED = {
|
28 |
+
"SquirroDocs": "https://docs.squirro.com/en/latest/technical/getting-started.html"
|
29 |
+
}
|
30 |
+
articlesDISABLED = {
|
31 |
+
"SquirroDocs": "/Users/michaelwechner/Desktop/docs.squirro.com_en_latest_technical_getting-started.html"
|
32 |
+
}
|
33 |
|
34 |
def build_graph_for_article(query, data_source_name):
|
35 |
"""
|
|
|
46 |
|
47 |
if data_source_name == "SquirroDocs":
|
48 |
logger.info(f"Loading document(s) from public website {query} ...")
|
49 |
+
loader = UnstructuredHTMLLoader(query)
|
50 |
+
raw_documents = loader.load()
|
51 |
else:
|
52 |
logger.info(f"Loading document(s) from Wikipedia using query '{query}' ...")
|
53 |
raw_documents = WikipediaLoader(query=query, load_max_docs=load_max_documents).load()
|
|
|
56 |
logger.error(f"Failed to load content for Data Source '{data_source_name}'!")
|
57 |
return
|
58 |
|
59 |
+
logger.info(f"{str(len(raw_documents))} document(s) loaded.")
|
60 |
for doc in raw_documents:
|
61 |
logger.info(f"Document: {doc.metadata['source']}")
|
62 |
#print(f"Document: {doc.page_content}")
|