# Create AI-Tutor vector database

In [None]:
import os
from getpass import getpass

# Never hard-code an API key in the notebook: a key committed to source must be
# treated as leaked and revoked. Reuse a key that is already exported in the
# environment and prompt for one (without echoing it) only when none is set.
# "OPENAI_API_KEY" will be used by the OpenAI client later.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [None]:
import nest_asyncio

# Patch asyncio so that an already-running event loop can be re-entered.
# NOTE(review): presumably required because the notebook kernel runs its own
# loop and the index build further down uses use_async=True — confirm.
nest_asyncio.apply()

In [None]:
import chromadb

# Create an on-disk Chroma client and a new collection.
# chromadb.PersistentClient is pointed at the "./ai-tutor-db" folder
# (chromadb.EphemeralClient would keep the data in memory only).
# NOTE(review): create_collection is expected to fail when "ai-tutor-db" already
# exists from an earlier run — confirm, and remove the folder before rebuilding.
chroma_client = chromadb.PersistentClient(path="./ai-tutor-db")
chroma_collection = chroma_client.create_collection("ai-tutor-db")

In [None]:
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext

# Wrap the Chroma collection in a LlamaIndex vector store and define a storage
# context on top of it; the context is handed to VectorStoreIndex when the
# index is built further down.
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [None]:
import json
from llama_index.core.schema import TextNode


def load_jsonl_create_nodes(filepath):
    """Read a JSON Lines file and build one ``TextNode`` per record.

    Every non-blank line must hold one JSON object. Its ``content`` value
    becomes the node text, while ``title``, ``url`` and ``source`` are stored
    as metadata (``None`` when a key is absent). The three metadata keys are
    excluded from both the embedding input and the LLM prompt.

    Args:
        filepath: Path of the ``.jsonl`` file to read.

    Returns:
        list: The created ``TextNode`` objects, in file order. Empty when the
        file contains no records.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    metadata_keys = ["title", "url", "source"]
    nodes = []  # List to hold the created node objects
    # Decode explicitly as UTF-8 instead of relying on the platform default.
    with open(filepath, "r", encoding="utf-8") as file:
        for line in file:
            # Tolerate blank lines (e.g. a trailing empty line at end of file),
            # which json.loads would otherwise reject.
            if not line.strip():
                continue
            json_obj = json.loads(line)
            node = TextNode(
                text=json_obj.get("content"),
                metadata={key: json_obj.get(key) for key in metadata_keys},
                excluded_embed_metadata_keys=list(metadata_keys),
                excluded_llm_metadata_keys=list(metadata_keys),
            )
            nodes.append(node)
    return nodes

In [None]:
# Load every record of the combined JSONL dataset as a node.
filepath = "../data/ai-tutor-csv-files/combined_data_lines.jsonl"
nodes = load_jsonl_create_nodes(filepath)

print(f"Loaded {len(nodes)} nodes/chunks from the JSONL file\n ")

# Preview two samples: the first node and the one 10000 positions from the end.
# Guard against an empty dataset and clamp the second index so that a dataset
# with fewer than 10000 nodes does not raise IndexError (it falls back to the
# first node in that case).
if nodes:
    node = nodes[0]
    print(f"ID: {node.id_} \nText: {node.text}, \nMetadata: {node.metadata}")

    print("\n")

    node = nodes[-min(10000, len(nodes))]
    print(f"ID: {node.id_} \nText: {node.text}, \nMetadata: {node.metadata}")

In [None]:
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import VectorStoreIndex

# Embedding model used to build the index. Alternatives kept for reference:
#   OpenAIEmbedding(model="text-embedding-3-small", mode="similarity")
#   OpenAIEmbedding(model="text-embedding-3-large", mode="similarity")
#   OpenAIEmbedding(model="text-embedding-ada-002", mode="similarity")
embeds = OpenAIEmbedding(model="text-embedding-3-large", mode="text_search")

# Build index / generate embeddings using OpenAI, storing them through the
# Chroma-backed storage context.
index = VectorStoreIndex(
    nodes=nodes,
    storage_context=storage_context,
    embed_model=embeds,
    insert_batch_size=3000,
    use_async=True,
    show_progress=True,
)

In [None]:
from llama_index.llms.openai import OpenAI

# Chat model (temperature 0) that writes the final answer.
llm = OpenAI(model="gpt-3.5-turbo-0125", temperature=0, max_tokens=None)

# Query engine over the index, configured with similarity_top_k=5.
query_engine = index.as_query_engine(
    llm=llm,
    embed_model=embeds,
    similarity_top_k=5,
)

In [None]:
# Smoke-test the freshly built index with a sample question.
res = query_engine.query("What is the LLama model?")

In [None]:
# Cell output: the `response` attribute of the query result.
res.response

In [None]:
# Dump every source node attached to the response: id, title, text, score and
# the full metadata dict, followed by a separator line.
for src in res.source_nodes:
    print("Node ID\t", src.node_id)
    print("Title\t", src.metadata["title"])
    print("Text\t", src.text)
    print("Score\t", src.score)
    print("Metadata\t", src.metadata)
    print("-_" * 20)

# Load DB from disk

In [None]:
import chromadb
from llama_index.vector_stores.chroma import ChromaVectorStore
# Re-open the Chroma database persisted under "./ai-tutor-db" and wrap its
# collection in a LlamaIndex vector store (no index is created in this cell).
# get_or_create_collection is used here, unlike create_collection in the build
# section above.
db2 = chromadb.PersistentClient(path="./ai-tutor-db")
chroma_collection = db2.get_or_create_collection("ai-tutor-db")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

In [None]:
# Create the index object on top of the existing vector store.
from llama_index.core import VectorStoreIndex
# NOTE(review): no embed_model is passed here; the query engine built below
# passes embed_model=embeds explicitly. Presumably the query-time embedding
# model must match the text-embedding-3-large model used when the vectors were
# written — confirm before querying this index any other way.
index = VectorStoreIndex.from_vector_store(vector_store=vector_store)

In [None]:
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.core.vector_stores import (
    ExactMatchFilter,
    MetadataFilters,
    MetadataFilter,
    FilterOperator,
    FilterCondition,
)


# Metadata filter on the "source" key, combining two values with an OR
# condition (each MetadataFilter uses its default operator).
# NOTE(review): "lanchain_course" looks like a typo for "langchain_course" —
# check the "source" values actually stored in the collection before use.
filters = MetadataFilters(
    filters=[
        MetadataFilter(key="source", value="lanchain_course"),
        MetadataFilter(key="source", value="langchain_docs"),
    ],
    condition=FilterCondition.OR,
)

# Same LLM and embedding settings as in the build section of this notebook.
llm = OpenAI(temperature=0, model="gpt-3.5-turbo-0125", max_tokens=None)
embeds = OpenAIEmbedding(model="text-embedding-3-large", mode="text_search")
# Streaming + filtered variant, kept for reference. `filters` is only used by
# this commented-out call; the active engine below applies no filter.
# query_engine = index.as_query_engine(
#     llm=llm, similarity_top_k=5, embed_model=embeds, verbose=True, streaming=True, filters=filters
# )
query_engine = index.as_query_engine(
    llm=llm, similarity_top_k=5, embed_model=embeds, verbose=True,
)

In [None]:
# Ask the same sample question against the index reloaded from disk.
res = query_engine.query("What is the LLama model?")

# Streaming variant, kept for reference: it iterates res.response_gen and
# prints the accumulated text after every token. Presumably this only works
# when the engine is built with streaming=True (as in the commented-out
# as_query_engine call in the previous cell) — confirm.
# history = ""
# for token in res.response_gen:
#     history += token
#     print(history)

In [None]:
# Cell output: the `response` attribute of the query result.
res.response

In [None]:
# Dump every source node attached to the response: id, source, title, text and
# score, followed by a separator line.
for src in res.source_nodes:
    print("Node ID\t", src.node_id)
    print("Source\t", src.metadata["source"])
    print("Title\t", src.metadata["title"])
    print("Text\t", src.text)
    print("Score\t", src.score)
    print("-_" * 20)

In [None]:
from IPython.display import Markdown, display
def display_prompt_dict(prompts_dict):
    """Show every prompt contained in ``prompts_dict``.

    For each ``(key, prompt)`` pair a Markdown header carrying the key is
    displayed, the value of ``prompt.get_template()`` is printed as plain
    text, and two Markdown line breaks are displayed as a separator.

    Args:
        prompts_dict: Mapping of prompt keys to prompt objects exposing
            ``get_template()``.
    """
    spacer = "<br><br>"
    for prompt_key, prompt in prompts_dict.items():
        header = f"**Prompt Key**: {prompt_key}<br>**Text:** <br>"
        display(Markdown(header))
        print(prompt.get_template())
        display(Markdown(spacer))

In [None]:
# Collect the prompts exposed by the query engine, keyed by prompt name.
prompts_dict = query_engine.get_prompts()

In [None]:
# Render each collected prompt with the display_prompt_dict helper defined in this notebook.
display_prompt_dict(prompts_dict)