|
import kuzu |
|
import logging |
|
import sys |
|
import os |
|
import rdflib |
|
from rdflib import Graph, Literal, RDF, URIRef |
|
from rdflib.namespace import FOAF, XSD, Namespace |
|
|
|
from llama_index.graph_stores import KuzuGraphStore |
|
from llama_index import ( |
|
SimpleDirectoryReader, |
|
ServiceContext, |
|
KnowledgeGraphIndex, |
|
) |
|
from llama_index.readers import SimpleWebPageReader |
|
from llama_index.indices.loading import load_index_from_storage |
|
|
|
from llama_index.llms import OpenAI |
|
from IPython.display import Markdown, display |
|
from llama_index.storage.storage_context import StorageContext |
|
|
|
from pyvis.network import Network |
|
import pandas as pd |
|
import numpy as np |
|
import plotly.express as px |
|
import umap |
|
|
|
def make_dir(): |
|
if(not os.path.exists("data")): |
|
os.mkdir('data') |
|
|
|
|
|
def save_uploadedfile(uploadedfile):
    """Persist an uploaded file object into the local ``data`` directory.

    The object is expected to expose ``name`` and ``getbuffer()``
    (presumably a Streamlit ``UploadedFile`` — confirm against the caller).
    """
    destination = os.path.join("data", uploadedfile.name)
    with open(destination, "wb") as out_file:
        out_file.write(uploadedfile.getbuffer())
|
|
|
def load_index(token, name, base_url):
    """Reload a previously persisted knowledge-graph index.

    Exports the OpenAI credentials into the environment, reopens the Kuzu
    graph database under ``name + "/kg"``, and rebuilds the index from the
    storage context persisted under ``name + "/storage"``.
    """
    os.environ["OPENAI_API_KEY"] = token
    os.environ["OPENAI_API_BASE"] = base_url
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    kuzu_db = kuzu.Database(name + "/kg")
    kuzu_store = KuzuGraphStore(kuzu_db)
    chat_llm = OpenAI(temperature=0, model="gpt-3.5-turbo", api_key=token, openai_api_base=base_url)

    svc_ctx = ServiceContext.from_defaults(llm=chat_llm, chunk_size=512)
    store_ctx = StorageContext.from_defaults(graph_store=kuzu_store, persist_dir=name + "/storage")
    return load_index_from_storage(storage_context=store_ctx, service_context=svc_ctx)
|
|
|
|
|
def get_index_pdf(token, name, base_url):
    """Build a knowledge-graph index from the documents in ``./data`` and persist it.

    Loads every file under ``./data``, extracts triplets with the OpenAI LLM,
    stores the graph in a Kuzu database under ``name + "/kg"``, persists the
    index under ``name + "/storage"``, and returns the built index.
    """
    documents = SimpleDirectoryReader("./data").load_data()
    # Original code called os.mkdir(name), which raised FileExistsError on a
    # re-run; makedirs with exist_ok makes rebuilding idempotent. The two
    # duplicated debug print(documents) calls were removed.
    os.makedirs(name, exist_ok=True)
    os.environ["OPENAI_API_KEY"] = token
    os.environ["OPENAI_API_BASE"] = base_url
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    db = kuzu.Database(name + "/kg")
    graph_store = KuzuGraphStore(db)
    llm = OpenAI(temperature=0, model="gpt-3.5-turbo", api_key=token, openai_api_base=base_url)
    service_context = ServiceContext.from_defaults(llm=llm, chunk_size=512)
    storage_context = StorageContext.from_defaults(graph_store=graph_store)

    index = KnowledgeGraphIndex.from_documents(documents=documents,
                                               max_triplets_per_chunk=2,
                                               storage_context=storage_context,
                                               service_context=service_context,
                                               show_progress=True,
                                               include_embeddings=True)
    index.storage_context.persist(name + "/storage")

    return index
|
|
|
def get_index(links, token, name, base_url):
    """Build a knowledge-graph index from a list of web page URLs and persist it.

    Fetches each URL in ``links`` (converted to text), extracts triplets with
    the OpenAI LLM, stores the graph in a Kuzu database under ``name + "/kg"``,
    persists the index under ``name + "/storage"``, and returns the built index.
    """
    # Original code called os.mkdir(name), which raised FileExistsError on a
    # re-run; makedirs with exist_ok makes rebuilding idempotent.
    os.makedirs(name, exist_ok=True)
    os.environ["OPENAI_API_KEY"] = token
    os.environ["OPENAI_API_BASE"] = base_url
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    db = kuzu.Database(name + "/kg")
    graph_store = KuzuGraphStore(db)

    documents = SimpleWebPageReader(html_to_text=True).load_data(
        links
    )

    llm = OpenAI(temperature=0, model="gpt-3.5-turbo", api_key=token, openai_api_base=base_url)
    service_context = ServiceContext.from_defaults(llm=llm, chunk_size=512)
    storage_context = StorageContext.from_defaults(graph_store=graph_store)

    index = KnowledgeGraphIndex.from_documents(documents=documents,
                                               max_triplets_per_chunk=2,
                                               storage_context=storage_context,
                                               service_context=service_context,
                                               show_progress=True,
                                               include_embeddings=True)
    index.storage_context.persist(name + "/storage")

    return index
|
|
|
def get_network_graph(index):
    """Export the index's triplet graph to ``kuzugraph_draw3.html`` via pyvis."""
    nx_graph = index.get_networkx_graph()
    pyvis_net = Network(directed=True)
    pyvis_net.from_nx(nx_graph)
    pyvis_net.save_graph("kuzugraph_draw3.html")
|
|
|
|
|
def get_embeddings(index):
    """Extract the per-node embedding column from the index structure.

    Returns a pandas Series (the ``embedding_dict`` column of the index
    struct, keyed by node id) with missing-embedding rows dropped.
    """
    struct_dict = index.index_struct.to_dict()
    embedding_series = pd.DataFrame.from_dict(struct_dict)["embedding_dict"]
    return embedding_series.dropna()
|
|
|
|
|
def get_visualize_embeddings(embedding_series, n_neighbors=15, min_dist=0.1, n_components=2):
    """Project embedding vectors to 2-D with UMAP and return a plotly scatter figure.

    Parameters:
        embedding_series: pandas Series of embedding vectors (one per node,
            indexed by node id, as produced by ``get_embeddings``).
        n_neighbors, min_dist, n_components: UMAP hyperparameters.

    Returns:
        A plotly Figure with one point per embedding, labelled by its index.
    """
    # .iloc[0] is positional access; the original ``embedding_series[0]`` was
    # a label lookup and raised KeyError for the string-keyed series produced
    # by get_embeddings.
    dim = len(embedding_series.iloc[0])
    embedding_df = pd.DataFrame(
        embedding_series.tolist(),
        columns=[f'dim_{i+1}' for i in range(dim)],
    )

    umap_embedded = umap.UMAP(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        n_components=n_components,
        random_state=42,  # fixed seed so repeated renders are comparable
    ).fit_transform(embedding_df.values)

    umap_df = pd.DataFrame(umap_embedded, columns=['UMAP Dimension 1', 'UMAP Dimension 2'])
    umap_df['Label'] = embedding_series.index

    fig = px.scatter(umap_df, x='UMAP Dimension 1', y='UMAP Dimension 2', hover_data=['Label'], title='UMAP Visualization of Embeddings')
    return fig
|
|
|
def generate_rdf(index):
    """Translate the index's node structure into an rdflib Graph.

    Each node becomes an ``http://example.com/`` resource typed as ``Node``
    and carrying its text as a literal; each of its relationships becomes a
    (subject, predicate, object) triple in the same namespace.
    """
    rdf_graph = Graph()
    EX = Namespace("http://example.com/")

    for node in index.index_struct.node_dict.values():
        node_ref = EX[str(node.node_id)]

        rdf_graph.add((node_ref, RDF.type, EX["Node"]))
        rdf_graph.add((node_ref, EX["text"], Literal(node.text)))

        for rel in node.relationships:
            rdf_graph.add((node_ref, EX[rel.predicate], EX[str(rel.object_id)]))

    return rdf_graph
|
|
|
def visualize_rdf(rdf_graph):
    """Serialize an rdflib graph to a Turtle string.

    rdflib < 6 returns bytes from ``serialize()`` while rdflib >= 6 returns
    str; the original unconditionally called ``.decode`` and raised
    AttributeError on modern rdflib. Handle both.
    """
    serialized = rdf_graph.serialize(format="turtle")
    if isinstance(serialized, bytes):
        serialized = serialized.decode("utf-8")
    return serialized
|
def query_model(index, user_query):
    """Answer a natural-language query against the knowledge-graph index.

    Builds a hybrid (keyword + embedding) query engine that tree-summarizes
    the top 5 matches, runs ``user_query`` through it, and returns the
    answer text.
    """
    engine = index.as_query_engine(
        include_text=True,
        response_mode="tree_summarize",
        embedding_mode="hybrid",
        similarity_top_k=5,
    )
    answer = engine.query(user_query)
    return answer.response