Prakhar Bhandari committed
Commit c8025cd · 1 Parent(s): 2d9bc04

Modular v1.0

kg_builder/.DS_Store ADDED
Binary file (6.15 kB).
 
kg_builder/README.md ADDED
File without changes
kg_builder/requirements.txt ADDED
@@ -0,0 +1,10 @@
+ numpy
+ pandas
+ requests
+ openai
+ neo4j
+ wikipedia
+ tiktoken
+ langchain
+ langchain_openai
+ tqdm
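
The requirements are unpinned, so versions float; with the langchain 0.1.x line this commit targets (per the pip log in the deleted notebook), langchain_community, which the src modules import from, arrives as a transitive dependency of langchain. A typical setup, assuming a Python 3.9 environment to match the committed .pyc files:

pip install -r kg_builder/requirements.txt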
kg_builder/src/.DS_Store ADDED
Binary file (6.15 kB).

kg_builder/src/__init__.py ADDED
File without changes
kg_builder/src/__pycache__/api_connections.cpython-39.pyc ADDED
Binary file (507 Bytes).

kg_builder/src/__pycache__/knowledge_graph_builder.cpython-39.pyc ADDED
Binary file (7.19 kB).

kg_builder/src/__pycache__/query_graph.cpython-39.pyc ADDED
Binary file (829 Bytes).

kg_builder/src/api_connections.py ADDED
@@ -0,0 +1,16 @@
+
+ from langchain_community.graphs import Neo4jGraph
+ import os
+
+ # Neo4j connection setup (the password is read from the environment; NEO4J_PASSWORD is an assumed variable name — never commit secrets)
+ url = "neo4j+s://2f409740.databases.neo4j.io"
+ username = "neo4j"
+ password = os.environ["NEO4J_PASSWORD"]
+ graph = Neo4jGraph(
+     url=url,
+     username=username,
+     password=password
+ )
+
+ # OpenAI API key setup: expected to be exported in the shell, not hardcoded
+ assert "OPENAI_API_KEY" in os.environ, "export OPENAI_API_KEY before running"
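
Every other module imports the shared graph handle from this file. A minimal connectivity check (a sketch; Neo4jGraph.query() executes arbitrary Cypher against the configured instance):

from api_connections import graph

# Count the nodes currently in the database to confirm the connection works
print(graph.query("MATCH (n) RETURN count(n) AS nodes"))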
kg_builder/src/knowledge_graph_builder.py ADDED
@@ -0,0 +1,138 @@
+
+ # knowledge_graph_builder.py — extraction models, mapping helpers, and the LLM chain
+ from api_connections import graph
+
+ from langchain_community.graphs.graph_document import (
+     Node as BaseNode,
+     Relationship as BaseRelationship,
+     GraphDocument,
+ )
+ from langchain.schema import Document
+ from typing import List, Dict, Any, Optional
+ from langchain.pydantic_v1 import Field, BaseModel
+
+ class Property(BaseModel):
+     """A single property consisting of key and value"""
+     key: str = Field(..., description="key")
+     value: str = Field(..., description="value")
+
+ class Node(BaseNode):
+     properties: Optional[List[Property]] = Field(
+         None, description="List of node properties")
+
+ class Relationship(BaseRelationship):
+     properties: Optional[List[Property]] = Field(
+         None, description="List of relationship properties"
+     )
+
+ class KnowledgeGraph(BaseModel):
+     """Generate a knowledge graph with entities and relationships."""
+     nodes: List[Node] = Field(..., description="List of nodes in the knowledge graph")
+     rels: List[Relationship] = Field(..., description="List of relationships in the knowledge graph")
+
+ def format_property_key(s: str) -> str:
+     words = s.split()
+     if not words:
+         return s
+     first_word = words[0].lower()
+     capitalized_words = [word.capitalize() for word in words[1:]]
+     return "".join([first_word] + capitalized_words)
+
+ def props_to_dict(props) -> dict:
+     """Convert properties to a dictionary."""
+     properties = {}
+     if not props:
+         return properties
+     for p in props:
+         properties[format_property_key(p.key)] = p.value
+     return properties
+
+ def map_to_base_node(node: Node) -> BaseNode:
+     """Map the KnowledgeGraph Node to the base Node."""
+     properties = props_to_dict(node.properties) if node.properties else {}
+     properties["name"] = node.id.title()  # Add a name property for better Cypher statement generation
+     return BaseNode(
+         id=node.id.title(), type=node.type.capitalize(), properties=properties
+     )
+
+ def map_to_base_relationship(rel: Relationship) -> BaseRelationship:
+     """Map the KnowledgeGraph Relationship to the base Relationship."""
+     source = map_to_base_node(rel.source)
+     target = map_to_base_node(rel.target)
+     properties = props_to_dict(rel.properties) if rel.properties else {}
+     return BaseRelationship(
+         source=source, target=target, type=rel.type, properties=properties
+     )
+
+ import os
+ from langchain.chains.openai_functions import (
+     create_openai_fn_chain,
+     create_structured_output_runnable,
+     create_structured_output_chain,
+ )
+ from langchain_openai import ChatOpenAI
+ from langchain.prompts import ChatPromptTemplate
+
+ # The OpenAI API key is read from the environment for LLM calls; never hardcode secrets
+ assert "OPENAI_API_KEY" in os.environ, "export OPENAI_API_KEY before running"
+ llm = ChatOpenAI(model="gpt-3.5-turbo-16k", temperature=0)
+
+ def get_extraction_chain(
+     allowed_nodes: Optional[List[str]] = None,
+     allowed_rels: Optional[List[str]] = None
+ ):
+     prompt = ChatPromptTemplate.from_messages(
+     [(
+         "system",
+         f"""# Knowledge Graph Instructions for GPT-4
+ ## 1. Overview
+ You are a sophisticated algorithm tailored for parsing Wikipedia pages to construct a knowledge graph about chemotherapy and related cancer treatments.
+ - **Nodes** symbolize entities such as medical conditions, drugs, symptoms, treatments, and associated medical concepts.
+ - The goal is to create a precise and comprehensible knowledge graph, serving as a reliable resource for medical practitioners and scholarly research.
+
+ ## 2. Labeling Nodes
+ - **Consistency**: Utilize uniform labels for node types to maintain clarity.
+   - For instance, consistently label drugs as **"Drug"**, symptoms as **"Symptom"**, and treatments as **"Treatment"**.
+ - **Node IDs**: Apply descriptive, legible identifiers for node IDs, sourced directly from the text.
+
+ {'- **Allowed Node Labels:** ' + ", ".join(allowed_nodes) if allowed_nodes else ""}
+ {'- **Allowed Relationship Types:** ' + ", ".join(allowed_rels) if allowed_rels else ""}
+
+ ## 3. Handling Numerical Data and Dates
+ - Integrate numerical data and dates as attributes of the corresponding nodes.
+ - **No Isolated Nodes for Dates/Numbers**: Directly associate dates and numerical figures as attributes with pertinent nodes.
+ - **Property Format**: Follow a straightforward key-value pattern for properties, with keys in camelCase, for example, `approvedYear`, `dosageAmount`.
+
+ ## 4. Coreference Resolution
+ - **Entity Consistency**: Guarantee uniform identification of each entity across the graph.
+   - For example, if "Methotrexate" and "MTX" reference the same medication, uniformly apply "Methotrexate" as the node ID.
+
+ ## 5. Relationship Naming Conventions
+ - **Clarity and Standardization**: Utilize clear and standardized relationship names, preferring uppercase with underscores for readability.
+   - For instance, use "HAS_SIDE_EFFECT" instead of "HASSIDEEFFECT" and "CAN_RESULT_FROM" instead of "CANRESULTFROM"; relationship types written without the underscores will be rejected.
+ - **Relevance and Specificity**: Choose relationship names that accurately reflect the connection between nodes, such as "INHIBITS" or "ACTIVATES" for interactions between substances.
+
+ ## 6. Strict Compliance
+ Rigorous adherence to these instructions is essential. Failure to comply with the specified formatting and labeling norms will necessitate output revision or discard.
+ """),
+         ("human", "Use the given format to extract information from the following input: {input}"),
+         ("human", "Tip: Precision in the node and relationship creation is vital for the integrity of the knowledge graph."),
+     ])
+     return create_structured_output_chain(KnowledgeGraph, llm, prompt)
+
+ def extract_and_store_graph(
+     document: Document,
+     nodes: Optional[List[str]] = None,
+     rels: Optional[List[str]] = None) -> None:
+     # Extract graph data using OpenAI functions
+     extract_chain = get_extraction_chain(nodes, rels)
+     data = extract_chain.invoke(document.page_content)['function']
+     # Construct a graph document
+     graph_document = GraphDocument(
+         nodes=[map_to_base_node(node) for node in data.nodes],
+         relationships=[map_to_base_relationship(rel) for rel in data.rels],
+         source=document
+     )
+     # Store the extracted information in the graph
+     graph.add_graph_documents([graph_document])
+
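
A minimal smoke test for the extraction pipeline (a sketch: the document text and the label/relationship lists are illustrative, and live Neo4j and OpenAI connections are assumed):

from langchain.schema import Document
from knowledge_graph_builder import extract_and_store_graph

# One hand-written sentence stands in for a Wikipedia chunk
doc = Document(page_content="Methotrexate treats leukemia and may cause nausea.")
# nodes/rels feed the prompt's allowed-label and allowed-relationship lists
extract_and_store_graph(
    doc,
    nodes=["Drug", "MedicalCondition", "Symptom"],
    rels=["TREATS", "CAUSES"],
)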
kg_builder/src/main.py ADDED
@@ -0,0 +1,33 @@
+ from knowledge_graph_builder import extract_and_store_graph
+ from query_graph import query_knowledge_graph
+ from langchain_community.document_loaders import WikipediaLoader
+ from langchain.text_splitter import TokenTextSplitter
+ from tqdm import tqdm
+
+ def main():
+     print("Starting the script...")
+     # Take the Wikipedia article name as input
+     article_name = input("Enter the Wikipedia article name: ")
+
+     print(f"Loading documents for: {article_name}")
+     # Load and process the Wikipedia article
+     raw_documents = WikipediaLoader(query=article_name).load()
+     text_splitter = TokenTextSplitter(chunk_size=4096, chunk_overlap=96)
+     documents = text_splitter.split_documents(raw_documents[:5])  # Only process the first 5 documents
+
+     print("Building the knowledge graph...")
+     # Build the knowledge graph from the documents
+     for i, d in tqdm(enumerate(documents), total=len(documents)):
+         extract_and_store_graph(d)
+
+     print("Graph construction complete. Please enter your query.")
+     # Take a query related to the graph
+     user_query = input("Enter your query related to the graph: ")
+
+     print(f"Querying the graph with: {user_query}")
+     # Query the graph and print the answer
+     answer = query_knowledge_graph(user_query)
+     print("Answer to your query:", answer)
+
+ if __name__ == "__main__":
+     main()
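
Because the modules use flat imports (from api_connections import graph), the entry point is meant to be launched from inside kg_builder/src/ as python main.py, with OPENAI_API_KEY (and, under the variable name assumed above, NEO4J_PASSWORD) exported beforehand.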
kg_builder/src/query_graph.py ADDED
@@ -0,0 +1,23 @@
+ from langchain.chains import GraphCypherQAChain
+ from langchain_openai import ChatOpenAI
+ from api_connections import graph  # Importing 'graph' from 'api_connections.py'
+
+ def query_knowledge_graph(query):
+     print("Refreshing the graph schema...")
+     # Refresh the graph schema before querying
+     graph.refresh_schema()
+
+     print("Setting up the Cypher QA Chain...")
+     # Set up the Cypher QA chain with specific LLM configurations
+     cypher_chain = GraphCypherQAChain.from_llm(
+         graph=graph,
+         cypher_llm=ChatOpenAI(temperature=0, model="gpt-4"),
+         qa_llm=ChatOpenAI(temperature=0, model="gpt-3.5-turbo-16k"),
+         # verbose=True
+     )
+
+     print(f"Executing the query: {query}")
+     # Execute the query and return the results
+     result = cypher_chain.invoke({"query": query})
+     print("Query executed. Processing results...")
+     return result
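
A sketch of calling the helper directly; as the deleted notebook's sample run shows, GraphCypherQAChain.invoke() returns a dict carrying both the original query and a result string:

from query_graph import query_knowledge_graph

answer = query_knowledge_graph("Which drugs are used to treat leukemia?")
print(answer["result"])  # invoke() returns {'query': ..., 'result': ...}; print just the answer text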
kg_creation.ipynb DELETED
@@ -1,473 +0,0 @@
- Deleted: the original monolithic notebook (473 lines). Its code cells duplicated, nearly verbatim, what this commit splits across kg_builder/src/: the Neo4jGraph connection setup, the Property/Node/Relationship/KnowledgeGraph Pydantic models, the property-formatting and node/relationship mapping helpers, the GPT extraction prompt built with create_structured_output_chain, a WikipediaLoader + TokenTextSplitter ingestion loop over the "Chemotherapy" article, and the GraphCypherQAChain query step.
- Differences from the new modules: the notebook passed validate_cypher=True and verbose=True to GraphCypherQAChain.from_llm, and its recorded sample run illustrates why the prompt now insists on underscored relationship types — the chain generated MATCH (c:Condition {name: "Cancer"})-[:CANRESULTFROM]->(t:Treatment) RETURN t.name, matched nothing, and answered "I'm sorry, but I don't have the information to answer that question."
- The remaining deleted lines were cell-output residue (a pip "Requirement already satisfied" log, httpx request logs, and tqdm progress bars) and notebook metadata (Python 3.9.19 kernel, nbformat 4).