michaelwechner committed on
Commit
6feb084
·
1 Parent(s): 9a0d22f

use UnstructuredHTMLLoader

Browse files
Files changed (1) hide show
  1. kg_builder/src/graph_creation.py +14 -10
kg_builder/src/graph_creation.py CHANGED
@@ -1,6 +1,6 @@
1
  import logging
2
 
3
- from langchain_community.document_loaders import WikipediaLoader
4
  from langchain.text_splitter import TokenTextSplitter
5
  from knowledge_graph_builder import extract_and_store_graph
6
  from dotenv import load_dotenv
@@ -15,18 +15,21 @@ load_dotenv()
15
 
16
  # IMPORTANT: Make sure data source names match with values inside api_connections.py
17
  # Define articles / topics to load
18
- #articles = {
19
- # "Chemotherapy": "Chemotherapy",
20
- # "Traffic Law": "Traffic laws in the United States"
21
- #}
22
  # Switzerland: https://www.fedlex.admin.ch/eli/cc/1962/1364_1409_1420/de
23
  # Connecticut: https://en.wikipedia.org/wiki/Transportation_in_Connecticut#Rules_of_the_road
24
  articles = {
25
  "Traffic Law": "Traffic laws in the United States"
26
  }
27
- #articles = {
28
- # "SquirroDocs": "https://docs.squirro.com/en/latest/technical/getting-started.html"
29
- #}
 
 
 
30
 
31
  def build_graph_for_article(query, data_source_name):
32
  """
@@ -43,7 +46,8 @@ def build_graph_for_article(query, data_source_name):
43
 
44
  if data_source_name == "SquirroDocs":
45
  logger.info(f"Loading document(s) from public website {query} ...")
46
- raw_documents = None
 
47
  else:
48
  logger.info(f"Loading document(s) from Wikipedia using query '{query}' ...")
49
  raw_documents = WikipediaLoader(query=query, load_max_docs=load_max_documents).load()
@@ -52,7 +56,7 @@ def build_graph_for_article(query, data_source_name):
52
  logger.error(f"Failed to load content for Data Source '{data_source_name}'!")
53
  return
54
 
55
- logger.info(f"{str(len(raw_documents))} document(s) loaded from Wikipedia.")
56
  for doc in raw_documents:
57
  logger.info(f"Document: {doc.metadata['source']}")
58
  #print(f"Document: {doc.page_content}")
 
1
  import logging
2
 
3
+ from langchain_community.document_loaders import WikipediaLoader, UnstructuredHTMLLoader
4
  from langchain.text_splitter import TokenTextSplitter
5
  from knowledge_graph_builder import extract_and_store_graph
6
  from dotenv import load_dotenv
 
15
 
16
  # IMPORTANT: Make sure data source names match with values inside api_connections.py
17
  # Define articles / topics to load
18
+ articlesDISABLED = {
19
+ "Chemotherapy": "Chemotherapy",
20
+ "Traffic Law": "Traffic laws in the United States"
21
+ }
22
  # Switzerland: https://www.fedlex.admin.ch/eli/cc/1962/1364_1409_1420/de
23
  # Connecticut: https://en.wikipedia.org/wiki/Transportation_in_Connecticut#Rules_of_the_road
24
  articles = {
25
  "Traffic Law": "Traffic laws in the United States"
26
  }
27
+ articlesDISABLED = {
28
+ "SquirroDocs": "https://docs.squirro.com/en/latest/technical/getting-started.html"
29
+ }
30
+ articlesDISABLED = {
31
+ "SquirroDocs": "/Users/michaelwechner/Desktop/docs.squirro.com_en_latest_technical_getting-started.html"
32
+ }
33
 
34
  def build_graph_for_article(query, data_source_name):
35
  """
 
46
 
47
  if data_source_name == "SquirroDocs":
48
  logger.info(f"Loading document(s) from public website {query} ...")
49
+ loader = UnstructuredHTMLLoader(query)
50
+ raw_documents = loader.load()
51
  else:
52
  logger.info(f"Loading document(s) from Wikipedia using query '{query}' ...")
53
  raw_documents = WikipediaLoader(query=query, load_max_docs=load_max_documents).load()
 
56
  logger.error(f"Failed to load content for Data Source '{data_source_name}'!")
57
  return
58
 
59
+ logger.info(f"{str(len(raw_documents))} document(s) loaded.")
60
  for doc in raw_documents:
61
  logger.info(f"Document: {doc.metadata['source']}")
62
  #print(f"Document: {doc.page_content}")