import kuzu
import logging
import sys
import os
import rdflib
from rdflib import Graph, Literal, RDF, URIRef
from rdflib.namespace import FOAF, XSD, Namespace
#import llama_index
from llama_index.graph_stores import KuzuGraphStore
from llama_index import (
    SimpleDirectoryReader,
    ServiceContext,
    KnowledgeGraphIndex,
)
from llama_index.readers import SimpleWebPageReader
from llama_index.indices.loading import load_index_from_storage
from llama_index.llms import OpenAI
from IPython.display import Markdown, display
from llama_index.storage.storage_context import StorageContext
from pyvis.network import Network
import pandas as pd
import numpy as np
import plotly.express as px
import umap


def make_dir():
    # Create the local upload directory if it does not exist yet
    if not os.path.exists("data"):
        os.mkdir("data")


def save_uploadedfile(uploadedfile):
    # Persist an uploaded file (e.g. from a Streamlit uploader) into ./data
    with open(os.path.join("data", uploadedfile.name), "wb") as f:
        f.write(uploadedfile.getbuffer())


def load_index(token, name, base_url):
    # Reload a previously persisted knowledge-graph index from disk
    os.environ["OPENAI_API_KEY"] = token
    os.environ["OPENAI_API_BASE"] = base_url
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)
    db = kuzu.Database(name + "/kg")
    graph_store = KuzuGraphStore(db)
    llm = OpenAI(temperature=0, model="gpt-3.5-turbo", api_key=token, openai_api_base=base_url)
    service_context = ServiceContext.from_defaults(llm=llm, chunk_size=512)
    storage_context = StorageContext.from_defaults(graph_store=graph_store, persist_dir=name + "/storage")
    index = load_index_from_storage(storage_context=storage_context, service_context=service_context)
    return index


def get_index_pdf(token, name, base_url):
    # Build a knowledge-graph index from documents placed in ./data and persist it
    documents = SimpleDirectoryReader("./data").load_data()
    print(documents)
    os.mkdir(name)
    os.environ["OPENAI_API_KEY"] = token
    os.environ["OPENAI_API_BASE"] = base_url
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)
    db = kuzu.Database(name + "/kg")
    graph_store = KuzuGraphStore(db)
    llm = OpenAI(temperature=0, model="gpt-3.5-turbo", api_key=token, openai_api_base=base_url)
    service_context = ServiceContext.from_defaults(llm=llm, chunk_size=512)
    storage_context = StorageContext.from_defaults(graph_store=graph_store)
    index = KnowledgeGraphIndex.from_documents(
        documents=documents,
        max_triplets_per_chunk=2,
        storage_context=storage_context,
        service_context=service_context,
        show_progress=True,
        include_embeddings=True,
    )
    index.storage_context.persist(name + "/storage")
    return index


def get_index(links, token, name, base_url):
    # Build a knowledge-graph index from a list of web pages and persist it
    os.mkdir(name)
    os.environ["OPENAI_API_KEY"] = token
    os.environ["OPENAI_API_BASE"] = base_url
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)
    db = kuzu.Database(name + "/kg")
    graph_store = KuzuGraphStore(db)
    documents = SimpleWebPageReader(html_to_text=True).load_data(links)
    llm = OpenAI(temperature=0, model="gpt-3.5-turbo", api_key=token, openai_api_base=base_url)
    service_context = ServiceContext.from_defaults(llm=llm, chunk_size=512)
    storage_context = StorageContext.from_defaults(graph_store=graph_store)
    # NOTE: can take a while!
    index = KnowledgeGraphIndex.from_documents(
        documents=documents,
        max_triplets_per_chunk=2,
        storage_context=storage_context,
        service_context=service_context,
        show_progress=True,
        include_embeddings=True,
    )
    index.storage_context.persist(name + "/storage")
    return index


def get_network_graph(index):
    # Export the knowledge graph as an interactive pyvis HTML file
    g = index.get_networkx_graph()
    net = Network(directed=True)
    net.from_nx(g)
    # net.show("kuzugraph_draw3.html")
    net.save_graph("kuzugraph_draw3.html")


def get_embeddings(index):
    # Extract the per-triplet embedding dictionary from the serialized index struct
    embeddings = index.index_struct.to_dict()
    embeddings_df = pd.DataFrame.from_dict(embeddings)['embedding_dict']
    embeddings_df = embeddings_df.dropna()
    return embeddings_df


def get_visualize_embeddings(embedding_series, n_neighbors=15, min_dist=0.1, n_components=2):
    # Convert the Series of embedding vectors to a DataFrame
    # (use positional access, since the Series index holds triplet keys, not integers)
    embedding_df = pd.DataFrame(
        embedding_series.tolist(),
        columns=[f'dim_{i+1}' for i in range(len(embedding_series.iloc[0]))],
    )
    # Perform UMAP dimensionality reduction
    umap_embedded = umap.UMAP(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        n_components=n_components,
        random_state=42,
    ).fit_transform(embedding_df.values)
    # Plot the UMAP embedding using Plotly Express
    umap_df = pd.DataFrame(umap_embedded, columns=['UMAP Dimension 1', 'UMAP Dimension 2'])
    umap_df['Label'] = embedding_series.index
    fig = px.scatter(
        umap_df,
        x='UMAP Dimension 1',
        y='UMAP Dimension 2',
        hover_data=['Label'],
        title='UMAP Visualization of Embeddings',
    )
    return fig


def generate_rdf(index):
    g = Graph()
    # Define namespace prefixes
    EX = Namespace("http://example.com/")
    # Iterate over the nodes in the index
    # NOTE: this assumes the index struct exposes node_dict and node-level
    # relationships; adjust to the installed llama_index version if needed
    for node in index.index_struct.node_dict.values():
        subject = EX[str(node.node_id)]
        # Add triples for node properties
        g.add((subject, RDF.type, EX["Node"]))
        g.add((subject, EX["text"], Literal(node.text)))
        # Add triples for node relationships
        for relationship in node.relationships:
            predicate = EX[relationship.predicate]
            object_node = EX[str(relationship.object_id)]
            g.add((subject, predicate, object_node))
    return g


def visualize_rdf(rdf_graph):
    # Visualize the RDF graph (you can use a library like PyVis or D3.js)
    # For simplicity, serialize the RDF graph to a Turtle string
    # NOTE: rdflib >= 6 returns str from serialize(); older versions return bytes
    rdf_string = rdf_graph.serialize(format="turtle")
    if isinstance(rdf_string, bytes):
        rdf_string = rdf_string.decode("utf-8")
    return rdf_string


def query_model(index, user_query):
    # Query the knowledge-graph index with hybrid (keyword + embedding) retrieval
    query_engine = index.as_query_engine(
        include_text=True,
        response_mode="tree_summarize",
        embedding_mode="hybrid",
        similarity_top_k=5,
    )
    response = query_engine.query(user_query)
    return response.response
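

# --- Usage sketch (illustrative only, not part of the original module) ---
# A minimal example of how these helpers could be wired together, assuming a
# valid OpenAI-compatible API key and base URL. The token, base URL, index
# name ("demo"), and link below are placeholders, not values from this repo.
if __name__ == "__main__":
    token = os.environ.get("OPENAI_API_KEY", "")        # placeholder credential
    base_url = "https://api.openai.com/v1"              # placeholder endpoint
    links = ["https://example.com"]                     # placeholder page to index

    # Build and persist a knowledge-graph index, then query and visualize it
    index = get_index(links, token, "demo", base_url)
    print(query_model(index, "What is this page about?"))
    get_network_graph(index)                            # writes kuzugraph_draw3.html
    fig = get_visualize_embeddings(get_embeddings(index))
    fig.show()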