Omar Solano committed on
Commit 9c1e8a7 · 1 Parent(s): 471ad41

update gradio chatbot to latest version

Files changed (6)
  1. README.md +1 -1
  2. requirements.txt +101 -169
  3. scripts/custom_retriever.py +237 -28
  4. scripts/main.py +82 -43
  5. scripts/prompts.py +116 -14
  6. scripts/setup.py +122 -34
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 🧑🏻‍🏫
 colorFrom: gray
 colorTo: pink
 sdk: gradio
-sdk_version: 4.42.0
+sdk_version: 4.44.0
 app_file: scripts/main.py
 pinned: false
 ---
requirements.txt CHANGED
@@ -1,280 +1,212 @@
 aiofiles==23.2.1
 aiohappyeyeballs==2.4.0
-aiohttp==3.10.5
+aiohttp==3.10.6
 aiosignal==1.3.1
+aiostream==0.5.2
 annotated-types==0.7.0
-anyio==4.4.0
+anyio==4.6.0
 appnope==0.1.4
 asgiref==3.8.1
 asttokens==2.4.1
 attrs==24.2.0
-automat==24.8.1
-azure-core==1.30.2
-azure-identity==1.17.1
 backoff==2.2.1
 bcrypt==4.2.0
 beautifulsoup4==4.12.3
-bleach==6.1.0
-boto3==1.35.5
-botocore==1.35.5
-build==1.2.1
+boto3==1.35.26
+botocore==1.35.26
+build==1.2.2
 cachetools==5.5.0
-certifi==2024.7.4
-cffi==1.17.0
+certifi==2024.8.30
 charset-normalizer==3.3.2
 chroma-hnswlib==0.7.6
-chromadb==0.5.5
+chromadb==0.5.7
 click==8.1.7
-cohere==5.8.1
+cohere==5.9.4
 coloredlogs==15.0.1
 comm==0.2.2
-constantly==23.10.4
-contourpy==1.2.1
-cryptography==43.0.0
-cssselect==1.2.0
+contourpy==1.3.0
 cycler==0.12.1
 dataclasses-json==0.6.7
 debugpy==1.8.5
 decorator==5.1.1
-defusedxml==0.7.1
 deprecated==1.2.14
 dirtyjson==1.0.8
 distro==1.9.0
 dnspython==2.6.1
-docstring-parser==0.16
-executing==2.0.1
-fastapi==0.112.2
-fastavro==1.9.5
-fastjsonschema==2.20.0
+durationpy==0.7
+executing==2.1.0
+fastapi==0.115.0
+fastavro==1.9.7
 ffmpy==0.4.0
-filelock==3.15.4
+filelock==3.16.1
 flatbuffers==24.3.25
-fonttools==4.53.1
+fonttools==4.54.1
 frozenlist==1.4.1
-fsspec==2024.6.1
+fsspec==2024.9.0
 google-ai-generativelanguage==0.6.4
-google-api-core==2.19.1
-google-api-python-client==2.142.0
-google-auth==2.34.0
+google-api-core==2.20.0
+google-api-python-client==2.146.0
+google-auth==2.35.0
 google-auth-httplib2==0.2.0
-google-cloud-aiplatform==1.63.0
-google-cloud-bigquery==3.25.0
-google-cloud-core==2.4.1
-google-cloud-resource-manager==1.12.5
-google-cloud-storage==2.18.2
-google-crc32c==1.5.0
 google-generativeai==0.5.4
-google-resumable-media==2.7.2
-googleapis-common-protos==1.64.0
-gradio==4.42.0
+googleapis-common-protos==1.65.0
+gradio==4.44.0
 gradio-client==1.3.0
-greenlet==3.0.3
-grpc-google-iam-v1==0.13.1
-grpcio==1.66.0
+greenlet==3.1.1
+grpcio==1.66.1
 grpcio-status==1.62.3
+grpclib==0.4.7
 h11==0.14.0
+h2==4.1.0
+hpack==4.0.0
 httpcore==1.0.5
 httplib2==0.22.0
 httptools==0.6.1
-httpx==0.27.0
+httpx==0.27.2
 httpx-sse==0.4.0
-huggingface-hub==0.24.6
+huggingface-hub==0.25.1
 humanfriendly==10.0
-hyperlink==21.0.0
-idna==3.8
-importlib-metadata==8.0.0
-importlib-resources==6.4.4
-incremental==24.7.2
-instructor==1.3.4
+hyperframe==6.0.1
+idna==3.10
+importlib-metadata==8.4.0
+importlib-resources==6.4.5
 ipykernel==6.29.5
-ipython==8.26.0
-itemadapter==0.9.0
-itemloaders==1.3.1
+ipython==8.27.0
 jedi==0.19.1
 jinja2==3.1.4
-jiter==0.4.2
+jiter==0.5.0
 jmespath==1.0.1
 joblib==1.4.2
-jsonpatch==1.33
-jsonpath-python==1.0.6
-jsonpointer==3.0.0
-jsonschema==4.23.0
-jsonschema-specifications==2023.12.1
-jupyter-client==8.6.2
+jupyter-client==8.6.3
 jupyter-core==5.7.2
-jupyterlab-pygments==0.3.0
-kiwisolver==1.4.5
-kubernetes==30.1.0
-langchain==0.2.14
-langchain-chroma==0.1.2
-langchain-core==0.2.35
-langchain-openai==0.1.22
-langchain-text-splitters==0.2.2
-langsmith==0.1.104
-llama-cloud==0.0.15
-llama-index==0.11.1
-llama-index-agent-openai==0.3.0
-llama-index-cli==0.3.0
-llama-index-core==0.11.1
-llama-index-embeddings-adapter==0.2.1
-llama-index-embeddings-cohere==0.2.0
-llama-index-embeddings-huggingface==0.3.1
-llama-index-embeddings-openai==0.2.3
-llama-index-finetuning==0.2.0
-llama-index-indices-managed-llama-cloud==0.3.0
+kiwisolver==1.4.7
+kubernetes==31.0.0
+llama-cloud==0.1.0
+llama-index==0.11.13
+llama-index-agent-openai==0.3.4
+llama-index-cli==0.3.1
+llama-index-core==0.11.13.post1
+llama-index-embeddings-cohere==0.2.1
+llama-index-embeddings-openai==0.2.5
+llama-index-indices-managed-llama-cloud==0.4.0
 llama-index-legacy==0.9.48.post3
-llama-index-llms-azure-openai==0.2.0
-llama-index-llms-gemini==0.3.4
-llama-index-llms-mistralai==0.2.1
-llama-index-llms-openai==0.2.0
-llama-index-llms-replicate==0.2.0
-llama-index-multi-modal-llms-openai==0.2.0
-llama-index-postprocessor-cohere-rerank==0.2.0
+llama-index-llms-gemini==0.3.5
+llama-index-llms-openai==0.2.9
+llama-index-multi-modal-llms-openai==0.2.1
+llama-index-postprocessor-cohere-rerank==0.2.1
 llama-index-program-openai==0.2.0
 llama-index-question-gen-openai==0.2.0
-llama-index-readers-file==0.2.0
-llama-index-readers-llama-parse==0.2.0
+llama-index-readers-file==0.2.2
+llama-index-readers-llama-parse==0.3.0
 llama-index-vector-stores-chroma==0.2.0
-llama-parse==0.5.0
-logfire==0.51.0
-lxml==5.3.0
+llama-parse==0.5.6
+logfire==0.53.0
 markdown-it-py==3.0.0
 markupsafe==2.1.5
 marshmallow==3.22.0
 matplotlib==3.9.2
 matplotlib-inline==0.1.7
 mdurl==0.1.2
-minijinja==2.0.1
-mistralai==1.0.2
-mistune==3.0.2
-mmh3==4.1.0
+mmh3==5.0.1
+modal==0.64.136
 monotonic==1.6
 mpmath==1.3.0
-msal==1.30.0
-msal-extensions==1.2.0
-multidict==6.0.5
+multidict==6.1.0
 mypy-extensions==1.0.0
-nbclient==0.10.0
-nbconvert==7.16.4
-nbformat==5.10.4
 nest-asyncio==1.6.0
 networkx==3.3
 nltk==3.9.1
 numpy==1.26.4
 oauthlib==3.2.2
-onnxruntime==1.19.0
-openai==1.42.0
-opentelemetry-api==1.26.0
-opentelemetry-exporter-otlp-proto-common==1.26.0
-opentelemetry-exporter-otlp-proto-grpc==1.26.0
-opentelemetry-exporter-otlp-proto-http==1.26.0
-opentelemetry-instrumentation==0.47b0
-opentelemetry-instrumentation-asgi==0.47b0
-opentelemetry-instrumentation-fastapi==0.47b0
-opentelemetry-proto==1.26.0
-opentelemetry-sdk==1.26.0
-opentelemetry-semantic-conventions==0.47b0
-opentelemetry-util-http==0.47b0
+onnxruntime==1.19.2
+openai==1.47.1
+opentelemetry-api==1.27.0
+opentelemetry-exporter-otlp-proto-common==1.27.0
+opentelemetry-exporter-otlp-proto-grpc==1.27.0
+opentelemetry-exporter-otlp-proto-http==1.27.0
+opentelemetry-instrumentation==0.48b0
+opentelemetry-instrumentation-asgi==0.48b0
+opentelemetry-instrumentation-fastapi==0.48b0
+opentelemetry-proto==1.27.0
+opentelemetry-sdk==1.27.0
+opentelemetry-semantic-conventions==0.48b0
+opentelemetry-util-http==0.48b0
 orjson==3.10.7
 overrides==7.7.0
 packaging==24.1
-pandas==2.2.2
-pandocfilters==1.5.1
+pandas==2.2.3
 parameterized==0.9.0
-parsel==1.9.1
 parso==0.8.4
 pexpect==4.9.0
 pillow==10.4.0
-platformdirs==4.2.2
-portalocker==2.10.1
-posthog==3.5.2
+platformdirs==4.3.6
+posthog==3.6.6
 prompt-toolkit==3.0.47
-protego==0.3.1
 proto-plus==1.24.0
-protobuf==4.25.4
+protobuf==4.25.5
 psutil==6.0.0
 ptyprocess==0.7.0
 pure-eval==0.2.3
-pyasn1==0.6.0
-pyasn1-modules==0.4.0
-pycparser==2.22
-pydantic==2.8.2
-pydantic-core==2.20.1
-pydispatcher==2.0.7
+pyasn1==0.6.1
+pyasn1-modules==0.4.1
+pydantic==2.9.2
+pydantic-core==2.23.4
 pydub==0.25.1
 pygments==2.18.0
-pyjwt==2.9.0
-pymongo==4.8.0
-pyopenssl==24.2.1
+pymongo==4.9.1
 pyparsing==3.1.4
 pypdf==4.3.1
 pypika==0.48.9
 pyproject-hooks==1.1.0
 python-dateutil==2.9.0.post0
 python-dotenv==1.0.1
-python-multipart==0.0.9
-pytz==2024.1
+python-multipart==0.0.10
+pytz==2024.2
 pyyaml==6.0.2
 pyzmq==26.2.0
-queuelib==1.7.0
-referencing==0.35.1
-regex==2024.7.24
+regex==2024.9.11
 requests==2.32.3
-requests-file==2.1.0
 requests-oauthlib==2.0.0
-rich==13.8.0
-rpds-py==0.20.0
+rich==13.8.1
 rsa==4.9
-ruff==0.6.2
+ruff==0.6.7
 s3transfer==0.10.2
-safetensors==0.4.4
-scikit-learn==1.5.1
-scipy==1.14.1
-scrapy==2.11.2
 semantic-version==2.10.0
-sentence-transformers==2.7.0
-service-identity==24.1.0
-setuptools==73.0.1
-shapely==2.0.6
+setuptools==75.1.0
 shellingham==1.5.4
+sigtools==4.0.1
 six==1.16.0
 sniffio==1.3.1
 soupsieve==2.6
-sqlalchemy==2.0.32
+sqlalchemy==2.0.35
 stack-data==0.6.3
-starlette==0.38.2
+starlette==0.38.6
 striprtf==0.0.26
-sympy==1.13.2
-tabulate==0.9.0
-tenacity==8.3.0
-threadpoolctl==3.5.0
+sympy==1.13.3
+synchronicity==0.7.6
+tenacity==8.5.0
 tiktoken==0.7.0
-tinycss2==1.3.0
-tldextract==5.1.2
-tokenizers==0.19.1
+tokenizers==0.20.0
+toml==0.10.2
 tomlkit==0.12.0
-torch==2.4.0
 tornado==6.4.1
 tqdm==4.66.5
 traitlets==5.14.3
-transformers==4.44.2
-twisted==24.7.0
 typer==0.12.5
-types-requests==2.32.0.20240712
+types-certifi==2021.10.8.3
+types-requests==2.32.0.20240914
+types-toml==0.10.8.20240310
 typing-extensions==4.12.2
 typing-inspect==0.9.0
-tzdata==2024.1
+tzdata==2024.2
 uritemplate==4.1.1
-urllib3==2.2.2
+urllib3==2.2.3
 uvicorn==0.30.6
 uvloop==0.20.0
-w3lib==2.2.1
-watchfiles==0.23.0
+watchfiles==0.24.0
 wcwidth==0.2.13
-webencodings==0.5.1
 websocket-client==1.8.0
 websockets==12.0
 wrapt==1.16.0
-yarl==1.9.4
-zipp==3.20.0
-zope-interface==7.0.1
+yarl==1.12.1
+zipp==3.20.2
scripts/custom_retriever.py CHANGED
@@ -1,11 +1,75 @@
+import asyncio
 import time
-from typing import List
+import traceback
+from typing import List, Optional
 
 import logfire
+import tiktoken
+from cohere import AsyncClient
 from llama_index.core import QueryBundle
+from llama_index.core.async_utils import run_async_tasks
+from llama_index.core.callbacks import CBEventType, EventPayload
 from llama_index.core.retrievers import BaseRetriever, VectorIndexRetriever
-from llama_index.core.schema import NodeWithScore, TextNode
+from llama_index.core.schema import MetadataMode, NodeWithScore, QueryBundle, TextNode
 from llama_index.postprocessor.cohere_rerank import CohereRerank
+from llama_index.postprocessor.cohere_rerank.base import CohereRerank
+
+
+class AsyncCohereRerank(CohereRerank):
+    def __init__(
+        self,
+        top_n: int = 5,
+        model: str = "rerank-english-v3.0",
+        api_key: Optional[str] = None,
+    ) -> None:
+        super().__init__(top_n=top_n, model=model, api_key=api_key)
+        self._api_key = api_key
+        self._model = model
+        self._top_n = top_n
+
+    async def apostprocess_nodes(
+        self,
+        nodes: List[NodeWithScore],
+        query_bundle: Optional[QueryBundle] = None,
+    ) -> List[NodeWithScore]:
+        if query_bundle is None:
+            raise ValueError("Query bundle must be provided.")
+
+        if len(nodes) == 0:
+            return []
+
+        async_client = AsyncClient(api_key=self._api_key)
+
+        with self.callback_manager.event(
+            CBEventType.RERANKING,
+            payload={
+                EventPayload.NODES: nodes,
+                EventPayload.MODEL_NAME: self._model,
+                EventPayload.QUERY_STR: query_bundle.query_str,
+                EventPayload.TOP_K: self._top_n,
+            },
+        ) as event:
+            texts = [
+                node.node.get_content(metadata_mode=MetadataMode.EMBED)
+                for node in nodes
+            ]
+
+            results = await async_client.rerank(
+                model=self._model,
+                top_n=self._top_n,
+                query=query_bundle.query_str,
+                documents=texts,
+            )
+
+            new_nodes = []
+            for result in results.results:
+                new_node_with_score = NodeWithScore(
+                    node=nodes[result.index].node, score=result.relevance_score
+                )
+                new_nodes.append(new_node_with_score)
+            event.on_end(payload={EventPayload.NODES: new_nodes})
+
+        return new_nodes
 
 
 class CustomRetriever(BaseRetriever):
@@ -15,41 +79,64 @@ class CustomRetriever(BaseRetriever):
         self,
         vector_retriever: VectorIndexRetriever,
         document_dict: dict,
+        keyword_retriever,
+        mode: str = "AND",
     ) -> None:
         """Init params."""
 
         self._vector_retriever = vector_retriever
         self._document_dict = document_dict
+
+        self._keyword_retriever = keyword_retriever
+        if mode not in ("AND", "OR"):
+            raise ValueError("Invalid mode.")
+        self._mode = mode
+
         super().__init__()
 
-    def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
+    async def _aretrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
         """Retrieve nodes given query."""
 
        # LlamaIndex adds "\ninput is " to the query string
         query_bundle.query_str = query_bundle.query_str.replace("\ninput is ", "")
         query_bundle.query_str = query_bundle.query_str.rstrip()
 
-        logfire.info(f"Retrieving 10 nodes with string: '{query_bundle}'")
+        # logfire.info(f"Retrieving nodes with string: '{query_bundle}'")
         start = time.time()
-        nodes = self._vector_retriever.retrieve(query_bundle)
+        nodes = await self._vector_retriever.aretrieve(query_bundle)
+        keyword_nodes = await self._keyword_retriever.aretrieve(query_bundle)
 
-        duration = time.time() - start
-        logfire.info(f"Retrieving nodes took {duration:.2f}s")
+        # logfire.info(f"Number of vector nodes: {len(nodes)}")
+        # logfire.info(f"Number of keyword nodes: {len(keyword_nodes)}")
+
+        vector_ids = {n.node.node_id for n in nodes}
+        keyword_ids = {n.node.node_id for n in keyword_nodes}
+
+        combined_dict = {n.node.node_id: n for n in nodes}
+        combined_dict.update({n.node.node_id: n for n in keyword_nodes})
+
+        if self._mode == "AND":
+            retrieve_ids = vector_ids.intersection(keyword_ids)
+        else:
+            retrieve_ids = vector_ids.union(keyword_ids)
+
+        nodes = [combined_dict[rid] for rid in retrieve_ids]
 
         # Filter out nodes with the same ref_doc_id
         def filter_nodes_by_unique_doc_id(nodes):
             unique_nodes = {}
             for node in nodes:
-                doc_id = node.node.ref_doc_id
+                # doc_id = node.node.ref_doc_id
+                doc_id = node.node.source_node.node_id
                 if doc_id is not None and doc_id not in unique_nodes:
                     unique_nodes[doc_id] = node
             return list(unique_nodes.values())
 
         nodes = filter_nodes_by_unique_doc_id(nodes)
-        logfire.info(
-            f"Number of nodes after filtering the ones with same ref_doc_id: {len(nodes)}"
-        )
-        logfire.info(f"Nodes retrieved: {nodes}")
+        # logfire.info(
+        #     f"Number of nodes after filtering the ones with same ref_doc_id: {len(nodes)}"
+        # )
+        # logfire.info(f"Nodes retrieved: {nodes}")
 
         nodes_context = []
         for node in nodes:
@@ -59,32 +146,154 @@ class CustomRetriever(BaseRetriever):
             # print("Score\t", node.score)
             # print("Metadata\t", node.metadata)
             # print("-_" * 20)
-            if node.score < 0.2:
-                continue
+            doc_id = node.node.source_node.node_id  # type: ignore
             if node.metadata["retrieve_doc"] == True:
                 # print("This node will be replaced by the document")
-                doc = self._document_dict[node.node.ref_doc_id]
+                # doc = self._document_dict[node.node.ref_doc_id]
+                # print("retrieved doc == True")
+                doc = self._document_dict[doc_id]
                 # print(doc.text)
                 new_node = NodeWithScore(
-                    node=TextNode(text=doc.text, metadata=node.metadata),  # type: ignore
+                    node=TextNode(text=doc.text, metadata=node.metadata, id_=doc_id),  # type: ignore
+                    score=node.score,
+                )
+                nodes_context.append(new_node)
+            else:
+                node.node.node_id = doc_id
+                nodes_context.append(node)
+
+        try:
+            reranker = AsyncCohereRerank(top_n=3, model="rerank-english-v3.0")
+            nodes_context = await reranker.apostprocess_nodes(
+                nodes_context, query_bundle
+            )
+
+        except Exception as e:
+            error_msg = f"Error during reranking: {type(e).__name__}: {str(e)}\n"
+            error_msg += "Traceback:\n"
+            error_msg += traceback.format_exc()
+            logfire.error(error_msg)
+
+        nodes_filtered = []
+        total_tokens = 0
+        enc = tiktoken.encoding_for_model("gpt-4o-mini")
+        for node in nodes_context:
+            if node.score < 0.10:  # type: ignore
+                continue
+
+            # Count tokens
+            if "tokens" in node.node.metadata:
+                node_tokens = node.node.metadata["tokens"]
+            else:
+                node_tokens = len(enc.encode(node.node.text))  # type: ignore
+
+            if total_tokens + node_tokens > 100_000:
+                logfire.info("Skipping node due to token count exceeding 100k")
+                break
+
+            total_tokens += node_tokens
+            nodes_filtered.append(node)
+
+        # logfire.info(f"Final nodes to context {len(nodes_filtered)} nodes")
+        # logfire.info(f"Total tokens: {total_tokens}")
+
+        # duration = time.time() - start
+        # logfire.info(f"Retrieving nodes took {duration:.2f}s")
+
+        return nodes_filtered[:3]
+
+    # def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
+    #     return asyncio.run(self._aretrieve(query_bundle))
+
+    def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
+        """Retrieve nodes given query."""
+
+        # LlamaIndex adds "\ninput is " to the query string
+        query_bundle.query_str = query_bundle.query_str.replace("\ninput is ", "")
+        query_bundle.query_str = query_bundle.query_str.rstrip()
+        logfire.info(f"Retrieving nodes with string: '{query_bundle}'")
+
+        start = time.time()
+        nodes = self._vector_retriever.retrieve(query_bundle)
+        keyword_nodes = self._keyword_retriever.retrieve(query_bundle)
+
+        logfire.info(f"Number of vector nodes: {len(nodes)}")
+        logfire.info(f"Number of keyword nodes: {len(keyword_nodes)}")
+
+        vector_ids = {n.node.node_id for n in nodes}
+        keyword_ids = {n.node.node_id for n in keyword_nodes}
+
+        combined_dict = {n.node.node_id: n for n in nodes}
+        combined_dict.update({n.node.node_id: n for n in keyword_nodes})
+
+        if self._mode == "AND":
+            retrieve_ids = vector_ids.intersection(keyword_ids)
+        else:
+            retrieve_ids = vector_ids.union(keyword_ids)
+
+        nodes = [combined_dict[rid] for rid in retrieve_ids]
+
+        def filter_nodes_by_unique_doc_id(nodes):
+            unique_nodes = {}
+            for node in nodes:
+                # doc_id = node.node.ref_doc_id
+                doc_id = node.node.source_node.node_id
+                if doc_id is not None and doc_id not in unique_nodes:
+                    unique_nodes[doc_id] = node
+            return list(unique_nodes.values())
+
+        nodes = filter_nodes_by_unique_doc_id(nodes)
+        logfire.info(
+            f"Number of nodes after filtering the ones with same ref_doc_id: {len(nodes)}"
+        )
+        logfire.info(f"Nodes retrieved: {nodes}")
+
+        nodes_context = []
+        for node in nodes:
+            doc_id = node.node.source_node.node_id  # type: ignore
+            if node.metadata["retrieve_doc"] == True:
+                doc = self._document_dict[doc_id]
+                new_node = NodeWithScore(
+                    node=TextNode(text=doc.text, metadata=node.metadata, id_=doc_id),  # type: ignore
                     score=node.score,
                 )
                 nodes_context.append(new_node)
             else:
+                node.node.node_id = doc_id
                 nodes_context.append(node)
 
         try:
-            reranker = CohereRerank(top_n=5, model="rerank-english-v3.0")
+            reranker = CohereRerank(top_n=3, model="rerank-english-v3.0")
             nodes_context = reranker.postprocess_nodes(nodes_context, query_bundle)
-            nodes_filtered = []
-            for node in nodes_context:
-                if node.score < 0.10:  # type: ignore
-                    continue
-                else:
-                    nodes_filtered.append(node)
-            logfire.info(f"Cohere raranking to {len(nodes_filtered)} nodes")
-
-            return nodes_filtered
+
         except Exception as e:
-            logfire.error(f"Error reranking nodes with Cohere: {e}")
-            return nodes_context
+            error_msg = f"Error during reranking: {type(e).__name__}: {str(e)}\n"
+            error_msg += "Traceback:\n"
+            error_msg += traceback.format_exc()
+            logfire.error(error_msg)
+
+        nodes_filtered = []
+        total_tokens = 0
+        enc = tiktoken.encoding_for_model("gpt-4o-mini")
+        for node in nodes_context:
+            if node.score < 0.10:  # type: ignore
+                continue
+            if "tokens" in node.node.metadata:
+                node_tokens = node.node.metadata["tokens"]
+            else:
+                node_tokens = len(enc.encode(node.node.text))  # type: ignore
+
+            if total_tokens + node_tokens > 100_000:
+                logfire.info("Skipping node due to token count exceeding 100k")
+                break
+
+            total_tokens += node_tokens
+            nodes_filtered.append(node)
+
+        logfire.info(f"Final nodes to context {len(nodes_filtered)} nodes")
+        logfire.info(f"Total tokens: {total_tokens}")
+
+        duration = time.time() - start
+        logfire.info(f"Retrieving nodes took {duration:.2f}s")
+
+        return nodes_filtered[:3]
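
Note: the retriever now fuses vector and keyword hits (set union for "OR", intersection for "AND") before reranking and token-capping. A minimal sketch of how the pieces might be wired together, assuming the retrievers and document dictionary built in scripts/setup.py (the variable names here are illustrative, not from this commit):

    # Hypothetical wiring, mirroring what setup.py does.
    retriever = CustomRetriever(
        vector_retriever=vector_retriever,    # VectorIndexRetriever, similarity_top_k=15
        document_dict=document_dict,          # maps source-node id -> full Document
        keyword_retriever=keyword_retriever,  # KeywordTableSimpleRetriever
        mode="OR",                            # union of vector and keyword results
    )
    nodes = retriever.retrieve("How does LoRA fine-tuning work?")  # sync path
    # Inside an event loop, the async path awaits the Cohere rerank call:
    # nodes = await retriever.aretrieve(QueryBundle("How does LoRA fine-tuning work?"))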
scripts/main.py CHANGED
@@ -6,53 +6,59 @@ from llama_index.agent.openai import OpenAIAgent
 from llama_index.core.llms import MessageRole
 from llama_index.core.memory import ChatSummaryMemoryBuffer
 from llama_index.core.tools import RetrieverTool, ToolMetadata
+from llama_index.core.vector_stores import (
+    FilterCondition,
+    FilterOperator,
+    MetadataFilter,
+    MetadataFilters,
+)
 from llama_index.llms.openai import OpenAI
 from prompts import system_message_openai_agent
-from setup import (
+from setup import (  # custom_retriever_langchain,; custom_retriever_llama_index,; custom_retriever_openai_cookbooks,; custom_retriever_peft,; custom_retriever_transformers,; custom_retriever_trl,
     AVAILABLE_SOURCES,
     AVAILABLE_SOURCES_UI,
     CONCURRENCY_COUNT,
-    custom_retriever_langchain,
-    custom_retriever_llama_index,
-    custom_retriever_openai_cookbooks,
-    custom_retriever_peft,
-    custom_retriever_transformers,
-    custom_retriever_trl,
+    custom_retriever_all_sources,
 )
 
 
 def update_query_engine_tools(selected_sources):
     tools = []
     source_mapping = {
-        "Transformers Docs": (
-            custom_retriever_transformers,
-            "Transformers_information",
-            """Useful for general questions asking about the artificial intelligence (AI) field. Employ this tool to fetch information on topics such as language models (LLMs) models such as Llama3 and theory (transformer architectures), tips on prompting, quantization, etc.""",
-        ),
-        "PEFT Docs": (
-            custom_retriever_peft,
-            "PEFT_information",
-            """Useful for questions asking about efficient LLM fine-tuning. Employ this tool to fetch information on topics such as LoRA, QLoRA, etc.""",
-        ),
-        "TRL Docs": (
-            custom_retriever_trl,
-            "TRL_information",
-            """Useful for questions asking about fine-tuning LLMs with reinforcement learning (RLHF). Includes information about the Supervised Fine-tuning step (SFT), Reward Modeling step (RM), and the Proximal Policy Optimization (PPO) step.""",
-        ),
-        "LlamaIndex Docs": (
-            custom_retriever_llama_index,
-            "LlamaIndex_information",
-            """Useful for questions asking about retrieval augmented generation (RAG) with LLMs and embedding models. It is the documentation of a framework, includes info about fine-tuning embedding models, building chatbots, and agents with llms, using vector databases, embeddings, information retrieval with cosine similarity or bm25, etc.""",
-        ),
-        "OpenAI Cookbooks": (
-            custom_retriever_openai_cookbooks,
-            "openai_cookbooks_info",
-            """Useful for questions asking about accomplishing common tasks with the OpenAI API. Returns example code and guides stored in Jupyter notebooks, including info about ChatGPT GPT actions, OpenAI Assistants API, and How to fine-tune OpenAI's GPT-4o and GPT-4o-mini models with the OpenAI API.""",
-        ),
-        "LangChain Docs": (
-            custom_retriever_langchain,
-            "langchain_info",
-            """Useful for questions asking about the LangChain framework. It is the documentation of the LangChain framework, includes info about building chains, agents, and tools, using memory, prompts, callbacks, etc.""",
+        # "Transformers Docs": (
+        #     custom_retriever_transformers,
+        #     "Transformers_information",
+        #     """Useful for general questions asking about the artificial intelligence (AI) field. Employ this tool to fetch information on topics such as language models (LLMs) models such as Llama3 and theory (transformer architectures), tips on prompting, quantization, etc.""",
+        # ),
+        # "PEFT Docs": (
+        #     custom_retriever_peft,
+        #     "PEFT_information",
+        #     """Useful for questions asking about efficient LLM fine-tuning. Employ this tool to fetch information on topics such as LoRA, QLoRA, etc.""",
+        # ),
+        # "TRL Docs": (
+        #     custom_retriever_trl,
+        #     "TRL_information",
+        #     """Useful for questions asking about fine-tuning LLMs with reinforcement learning (RLHF). Includes information about the Supervised Fine-tuning step (SFT), Reward Modeling step (RM), and the Proximal Policy Optimization (PPO) step.""",
+        # ),
+        # "LlamaIndex Docs": (
+        #     custom_retriever_llama_index,
+        #     "LlamaIndex_information",
+        #     """Useful for questions asking about retrieval augmented generation (RAG) with LLMs and embedding models. It is the documentation of a framework, includes info about fine-tuning embedding models, building chatbots, and agents with llms, using vector databases, embeddings, information retrieval with cosine similarity or bm25, etc.""",
+        # ),
+        # "OpenAI Cookbooks": (
+        #     custom_retriever_openai_cookbooks,
+        #     "openai_cookbooks_info",
+        #     """Useful for questions asking about accomplishing common tasks with the OpenAI API. Returns example code and guides stored in Jupyter notebooks, including info about ChatGPT GPT actions, OpenAI Assistants API, and How to fine-tune OpenAI's GPT-4o and GPT-4o-mini models with the OpenAI API.""",
+        # ),
+        # "LangChain Docs": (
+        #     custom_retriever_langchain,
+        #     "langchain_info",
+        #     """Useful for questions asking about the LangChain framework. It is the documentation of the LangChain framework, includes info about building chains, agents, and tools, using memory, prompts, callbacks, etc.""",
+        # ),
+        "All Sources": (
+            custom_retriever_all_sources,
+            "all_sources_info",
+            """Useful for questions asking about information in the field of AI.""",
         ),
     }
 
@@ -80,9 +86,7 @@ def generate_completion(
     memory,
 ):
     with logfire.span("Running query"):
-        logfire.info(f"query: {query}")
-        logfire.info(f"model: {model}")
-        logfire.info(f"sources: {sources}")
+        logfire.info(f"User query: {query}")
 
         chat_list = memory.get()
 
@@ -102,7 +106,34 @@ def generate_completion(
         client = llm._get_client()
         logfire.instrument_openai(client)
 
-        query_engine_tools = update_query_engine_tools(sources)
+        query_engine_tools = update_query_engine_tools(["All Sources"])
+
+        filter_list = []
+        source_mapping = {
+            "Transformers Docs": "transformers",
+            "PEFT Docs": "peft",
+            "TRL Docs": "trl",
+            "LlamaIndex Docs": "llama_index",
+            "LangChain Docs": "langchain",
+            "OpenAI Cookbooks": "openai_cookbooks",
+            "Towards AI Blog": "tai_blog",
+        }
+
+        for source in sources:
+            if source in source_mapping:
+                filter_list.append(
+                    MetadataFilter(
+                        key="source",
+                        operator=FilterOperator.EQ,
+                        value=source_mapping[source],
+                    )
+                )
+
+        filters = MetadataFilters(
+            filters=filter_list,
+            condition=FilterCondition.OR,
+        )
+        query_engine_tools[0].retriever._vector_retriever._filters = filters
 
         agent = OpenAIAgent.from_tools(
             llm=llm,
@@ -151,8 +182,16 @@ def format_sources(completion) -> str:
     )
     document_template: str = "[🔗 {source}: {title}]({url}), relevance: {score:2.2f}"
     all_documents = []
-    for source in completion.sources:
-        for src in source.raw_output:
+    for source in completion.sources:  # looping over list[ToolOutput]
+        if isinstance(source.raw_output, Exception):
+            logfire.error(f"Error in source output: {source.raw_output}")
+            # pdb.set_trace()
+            continue
+
+        if not isinstance(source.raw_output, list):
+            logfire.warn(f"Unexpected source output type: {type(source.raw_output)}")
+            continue
+        for src in source.raw_output:  # looping over list[NodeWithScore]
             document = document_template.format(
                 title=src.metadata["title"],
                 score=src.score,
@@ -189,13 +228,13 @@ sources = gr.CheckboxGroup(
         "LlamaIndex Docs",
         "LangChain Docs",
         "OpenAI Cookbooks",
+        # "All Sources",
     ],
     interactive=True,
 )
 model = gr.Dropdown(
     [
         "gpt-4o-mini",
-        "gpt-4o",
     ],
     label="Model",
     value="gpt-4o-mini",
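
Note: source selection now happens through metadata filters on the single all-sources collection instead of per-source retrievers. A small sketch of the filter that generate_completion builds when a user checks two sources (values follow the source_mapping in the diff above):

    from llama_index.core.vector_stores import (
        FilterCondition,
        FilterOperator,
        MetadataFilter,
        MetadataFilters,
    )

    filters = MetadataFilters(
        filters=[
            MetadataFilter(key="source", operator=FilterOperator.EQ, value="transformers"),
            MetadataFilter(key="source", operator=FilterOperator.EQ, value="peft"),
        ],
        condition=FilterCondition.OR,  # a node matches if its source is either value
    )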
scripts/prompts.py CHANGED
@@ -1,19 +1,123 @@
-system_message_openai_agent = """You are an AI teacher, answering questions from students of an applied AI course on Large Language Models (LLMs or llm) and Retrieval Augmented Generation (RAG) for LLMs. Topics covered include training models, fine-tuning models, giving memory to LLMs, prompting tips, hallucinations and bias, vector databases, transformer architectures, embeddings, RAG frameworks, Langchain, LlamaIndex, making LLMs interact with tools, AI agents, reinforcement learning with human feedback. Questions should be understood in this context.
-
-Your answers are aimed to teach students, so they should be complete, clear, and easy to understand.
-
-Use the available tools to gather insights pertinent to the field of AI. Always use two tools at the same time. These tools accept a string (a user query rewritten as a statement) and return informative content regarding the domain of AI.
-e.g:
-User question: 'How can I fine-tune an LLM?'
-Input to the tool: 'Fine-tuning an LLM'
-
-User question: How can quantize an LLM?
-Input to the tool: 'Quantization for LLMs'
-
-User question: 'Teach me how to build an AI agent"'
-Input to the tool: 'Building an AI Agent'
-
-Only some information returned by the tools might be relevant to the question, so ignore the irrelevant part and answer the question with what you have.
-
+# # Prompt 1
+# system_message_openai_agent = """You are an AI teacher, answering questions from students of an applied AI course on Large Language Models (LLMs or llm) and Retrieval Augmented Generation (RAG) for LLMs.
+
+# Topics covered include training models, fine-tuning models, giving memory to LLMs, prompting tips, hallucinations and bias, vector databases, transformer architectures, embeddings, RAG frameworks such as Langchain and LlamaIndex, making LLMs interact with tools, AI agents, reinforcement learning with human feedback (RLHF). Questions should be understood in this context.
+
+# Your answers are aimed to teach students, so they should be complete, clear, and easy to understand.
+
+# Use the available tools to gather insights pertinent to the field of AI.
+
+# To answer student questions, always use the all_sources_info tool plus another one simultaneously. Meaning that should be using two tools in total.
+
+# Only some information returned by the tools might be relevant to the question, so ignore the irrelevant part and answer the question with what you have.
+
+# Your responses are exclusively based on the output provided by the tools. Refrain from incorporating information not directly obtained from the tool's responses.
+
+# When the conversation deepens or shifts focus within a topic, adapt your input to the tools to reflect these nuances. This means if a user requests further elaboration on a specific aspect of a previously discussed topic, you should reformulate your input to the tool to capture this new angle or more profound layer of inquiry.
+
+# Provide comprehensive answers, ideally structured in multiple paragraphs, drawing from the tool's variety of relevant details. The depth and breadth of your responses should align with the scope and specificity of the information retrieved.
+
+# Should the tools repository lack information on the queried topic, politely inform the user that the question transcends the bounds of your current knowledge base, citing the absence of relevant content in the tool's documentation.
+
+# At the end of your answers, always invite the students to ask deeper questions about the topic if they have any. Make sure reformulate the question to the tool to capture this new angle or more profound layer of inquiry.
+
+# Do not refer to the documentation directly, but use the information provided within it to answer questions.
+
+# If code is provided in the information, share it with the students. It's important to provide complete code blocks so they can execute the code when they copy and paste them.
+
+# Make sure to format your answers in Markdown format, including code blocks and snippets.
+# """
+
+# Prompt 2
+# system_message_openai_agent = """You are an AI teacher, answering questions from students of an applied AI course on Large Language Models (LLMs or llm) and Retrieval Augmented Generation (RAG) for LLMs.
+
+# Topics covered include training models, fine-tuning models, giving memory to LLMs, prompting tips, hallucinations and bias, vector databases, transformer architectures, embeddings, RAG frameworks such as Langchain and LlamaIndex, making LLMs interact with tools, AI agents, reinforcement learning with human feedback (RLHF). Questions should be understood in this context.
+
+# Your answers are aimed to teach students, so they should be complete, clear, and easy to understand.
+
+# Use the available tools to gather insights pertinent to the field of AI.
+
+# To answer student questions, always use the all_sources_info tool. For complex questions, you can decompose the user question into TWO sub questions (you are limited to two sub-questions) that can be answered by the tools.
+
+# These are the guidelines to consider if you decide to create sub questions:
+# * Be as specific as possible
+# * The two sub questions should be relevant to the user question
+# * The two sub questions should be answerable by the tools provided
+
+# Only some information returned by the tools might be relevant to the question, so ignore the irrelevant part and answer the question with what you have.
+
+# Your responses are exclusively based on the output provided by the tools. Refrain from incorporating information not directly obtained from the tool's responses.
+
+# When the conversation deepens or shifts focus within a topic, adapt your input to the tools to reflect these nuances. This means if a user requests further elaboration on a specific aspect of a previously discussed topic, you should reformulate your input to the tool to capture this new angle or more profound layer of inquiry.
+
+# Provide comprehensive answers, ideally structured in multiple paragraphs, drawing from the tool's variety of relevant details. The depth and breadth of your responses should align with the scope and specificity of the information retrieved.
+
+# Should the tools repository lack information on the queried topic, politely inform the user that the question transcends the bounds of your current knowledge base, citing the absence of relevant content in the tool's documentation.
+
+# At the end of your answers, always invite the students to ask deeper questions about the topic if they have any. Make sure reformulate the question to the tool to capture this new angle or more profound layer of inquiry.
+
+# Do not refer to the documentation directly, but use the information provided within it to answer questions.
+
+# If code is provided in the information, share it with the students. It's important to provide complete code blocks so they can execute the code when they copy and paste them.
+
+# Make sure to format your answers in Markdown format, including code blocks and snippets.
+# """
+
+# # Prompt 3
+# system_message_openai_agent = """You are an AI teacher, answering questions from students of an applied AI course on Large Language Models (LLMs or llm) and Retrieval Augmented Generation (RAG) for LLMs.
+
+# Topics covered include training models, fine-tuning models, giving memory to LLMs, prompting tips, hallucinations and bias, vector databases, transformer architectures, embeddings, RAG frameworks such as Langchain and LlamaIndex, making LLMs interact with tools, AI agents, reinforcement learning with human feedback (RLHF). Questions should be understood in this context.
+
+# Your answers are aimed to teach students, so they should be complete, clear, and easy to understand.
+
+# Use the available tools to gather insights pertinent to the field of AI.
+
+# To answer student questions, always use the all_sources_info tool. For each question, you should decompose the user question into TWO sub questions (you are limited to two sub-questions) that can be answered by the tools.
+
+# These are the guidelines to consider when creating sub questions:
+# * Be as specific as possible
+# * The two sub questions should be relevant to the user question
+# * The two sub questions should be answerable by the tools provided
+
+# Only some information returned by the tools might be relevant to the user question, so ignore the irrelevant part and answer the user question with what you have.
+
+# Your responses are exclusively based on the output provided by the tools. Refrain from incorporating information not directly obtained from the tool's responses.
+
+# When the conversation deepens or shifts focus within a topic, adapt your input to the tools to reflect these nuances. This means if a user requests further elaboration on a specific aspect of a previously discussed topic, you should reformulate your input to the tool to capture this new angle or more profound layer of inquiry.
+
+# Provide comprehensive answers, ideally structured in multiple paragraphs, drawing from the tool's variety of relevant details. The depth and breadth of your responses should align with the scope and specificity of the information retrieved.
+
+# Should the tools repository lack information on the queried topic, politely inform the user that the question transcends the bounds of your current knowledge base, citing the absence of relevant content in the tool's documentation.
+
+# At the end of your answers, always invite the students to ask deeper questions about the topic if they have any. Make sure reformulate the question to the tool to capture this new angle or more profound layer of inquiry.
+
+# Do not refer to the documentation directly, but use the information provided within it to answer questions.
+
+# If code is provided in the information, share it with the students. It's important to provide complete code blocks so they can execute the code when they copy and paste them.
+
+# Make sure to format your answers in Markdown format, including code blocks and snippets.
+# """
+
+
+# Prompt 4 Trying to make it like #1
+system_message_openai_agent = """You are an AI teacher, answering questions from students of an applied AI course on Large Language Models (LLMs or llm) and Retrieval Augmented Generation (RAG) for LLMs.
+
+Topics covered include training models, fine-tuning models, giving memory to LLMs, prompting tips, hallucinations and bias, vector databases, transformer architectures, embeddings, RAG frameworks such as Langchain and LlamaIndex, making LLMs interact with tools, AI agents, reinforcement learning with human feedback (RLHF). Questions should be understood in this context.
+
+Your answers are aimed to teach students, so they should be complete, clear, and easy to understand.
+
+Use the available tools to gather insights pertinent to the field of AI.
+
+To answer student questions, always use the all_sources_info tool plus another one simultaneously.
+Decompose the user question into TWO sub questions (you are limited to two sub-questions) one for each tool.
+Meaning that should be using two tools in total for each user question.
+
+These are the guidelines to consider if you decide to create sub questions:
+* Be as specific as possible
+* The two sub questions should be relevant to the user question
+* The two sub questions should be answerable by the tools provided
+
+Only some information returned by the tools might be relevant to the question, so ignore the irrelevant part and answer the question with what you have.
+
 Your responses are exclusively based on the output provided by the tools. Refrain from incorporating information not directly obtained from the tool's responses.
 
@@ -25,11 +129,9 @@ Should the tools repository lack information on the queried topic, politely info
 
 At the end of your answers, always invite the students to ask deeper questions about the topic if they have any. Make sure reformulate the question to the tool to capture this new angle or more profound layer of inquiry.
 
-Do not refer to the documentation directly, but use the information provided within it to answer questions.
+Do not refer to the documentation directly, but use the information provided within it to answer questions.
 
 If code is provided in the information, share it with the students. It's important to provide complete code blocks so they can execute the code when they copy and paste them.
 
 Make sure to format your answers in Markdown format, including code blocks and snippets.
-
-Politely reject questions not related to AI, while being cautious not to reject unfamiliar terms or acronyms too quickly. If a question seems unrelated but you suspect it might contain AI-related terminology.
 """
scripts/setup.py CHANGED
@@ -1,3 +1,5 @@
+import asyncio
+import json
 import logging
 import os
 import pickle
@@ -6,9 +8,16 @@ import chromadb
 import logfire
 from custom_retriever import CustomRetriever
 from dotenv import load_dotenv
-from llama_index.core import VectorStoreIndex
+from evaluate_rag_system import AsyncKeywordTableSimpleRetriever
+from llama_index.core import Document, SimpleKeywordTableIndex, VectorStoreIndex
+from llama_index.core.ingestion import IngestionPipeline
 from llama_index.core.node_parser import SentenceSplitter
-from llama_index.core.retrievers import VectorIndexRetriever
+from llama_index.core.retrievers import (
+    KeywordTableSimpleRetriever,
+    VectorIndexRetriever,
+)
+from llama_index.core.schema import NodeWithScore, QueryBundle
+from llama_index.embeddings.cohere import CohereEmbedding
 from llama_index.embeddings.openai import OpenAIEmbedding
 from llama_index.vector_stores.chroma import ChromaVectorStore
 from utils import init_mongo_db
@@ -21,11 +30,11 @@ logging.getLogger("httpx").setLevel(logging.WARNING)
 logfire.configure()
 
 
-if not os.path.exists("data/chroma-db-transformers"):
+if not os.path.exists("data/chroma-db-all_sources"):
     # Download the vector database from the Hugging Face Hub if it doesn't exist locally
     # https://huggingface.co/datasets/towardsai-buster/ai-tutor-vector-db/tree/main
     logfire.warn(
-        f"Vector database does not exist at 'data/chroma-db-transformers', downloading from Hugging Face Hub"
+        f"Vector database does not exist at 'data/chroma-db-all_sources', downloading from Hugging Face Hub"
     )
     from huggingface_hub import snapshot_download
 
@@ -34,51 +43,127 @@ if not os.path.exists("data/chroma-db-transformers"):
         local_dir="data",
         repo_type="dataset",
     )
-    logfire.info(f"Downloaded vector database to 'data/chroma-db-transformers'")
+    logfire.info(f"Downloaded vector database to 'data/chroma-db-all_sources'")
+
+
+def create_docs(input_file: str) -> list[Document]:
+    with open(input_file, "r") as f:
+        documents = []
+        for line in f:
+            data = json.loads(line)
+            documents.append(
+                Document(
+                    doc_id=data["doc_id"],
+                    text=data["content"],
+                    metadata={  # type: ignore
+                        "url": data["url"],
+                        "title": data["name"],
+                        "tokens": data["tokens"],
+                        "retrieve_doc": data["retrieve_doc"],
+                        "source": data["source"],
+                    },
+                    excluded_llm_metadata_keys=[
+                        "title",
+                        "tokens",
+                        "retrieve_doc",
+                        "source",
+                    ],
+                    excluded_embed_metadata_keys=[
+                        "url",
+                        "tokens",
+                        "retrieve_doc",
+                        "source",
+                    ],
+                )
+            )
+    return documents
 
 
 def setup_database(db_collection, dict_file_name):
     db = chromadb.PersistentClient(path=f"data/{db_collection}")
     chroma_collection = db.get_or_create_collection(db_collection)
     vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
+    embed_model = CohereEmbedding(
+        api_key=os.environ["COHERE_API_KEY"],
+        model_name="embed-english-v3.0",
+        input_type="search_query",
+    )
 
     index = VectorStoreIndex.from_vector_store(
         vector_store=vector_store,
-        embed_model=OpenAIEmbedding(model="text-embedding-3-large", mode="similarity"),
-        transformations=[SentenceSplitter(chunk_size=800, chunk_overlap=400)],
+        transformations=[SentenceSplitter(chunk_size=800, chunk_overlap=0)],
        show_progress=True,
         use_async=True,
     )
     vector_retriever = VectorIndexRetriever(
         index=index,
         similarity_top_k=15,
+        embed_model=embed_model,
         use_async=True,
-        embed_model=OpenAIEmbedding(model="text-embedding-3-large", mode="similarity"),
     )
     with open(f"data/{db_collection}/{dict_file_name}", "rb") as f:
         document_dict = pickle.load(f)
 
-    return CustomRetriever(vector_retriever, document_dict)
+    with open("data/keyword_retriever_sync.pkl", "rb") as f:
+        keyword_retriever: KeywordTableSimpleRetriever = pickle.load(f)
+
+    # # Creating the keyword index and retriever
+    # logfire.info("Creating nodes from documents")
+    # documents = create_docs("data/all_sources_data.jsonl")
+    # pipeline = IngestionPipeline(
+    #     transformations=[SentenceSplitter(chunk_size=800, chunk_overlap=0)]
+    # )
+    # all_nodes = pipeline.run(documents=documents, show_progress=True)
+    # # with open("data/all_nodes.pkl", "wb") as f:
+    # #     pickle.dump(all_nodes, f)
+
+    # # all_nodes = pickle.load(open("data/all_nodes.pkl", "rb"))
+    # logfire.info(f"Number of nodes: {len(all_nodes)}")
+
+    # keyword_index = SimpleKeywordTableIndex(
+    #     nodes=all_nodes, max_keywords_per_chunk=10, show_progress=True, use_async=False
+    # )
+    # # with open("data/keyword_index.pkl", "wb") as f:
+    # #     pickle.dump(keyword_index, f)
+
+    # # keyword_index = pickle.load(open("data/keyword_index.pkl", "rb"))
+
+    # logfire.info("Creating keyword retriever")
+    # keyword_retriever = KeywordTableSimpleRetriever(index=keyword_index)
+
+    # with open("data/keyword_retriever_sync.pkl", "wb") as f:
+    #     pickle.dump(keyword_retriever, f)
+
+    return CustomRetriever(vector_retriever, document_dict, keyword_retriever, "OR")
 
 
 # Setup retrievers
-custom_retriever_transformers = setup_database(
-    "chroma-db-transformers",
-    "document_dict_transformers.pkl",
-)
-custom_retriever_peft = setup_database("chroma-db-peft", "document_dict_peft.pkl")
-custom_retriever_trl = setup_database("chroma-db-trl", "document_dict_trl.pkl")
-custom_retriever_llama_index = setup_database(
-    "chroma-db-llama_index",
-    "document_dict_llama_index.pkl",
-)
-custom_retriever_openai_cookbooks = setup_database(
-    "chroma-db-openai_cookbooks",
-    "document_dict_openai_cookbooks.pkl",
-)
-custom_retriever_langchain = setup_database(
-    "chroma-db-langchain",
-    "document_dict_langchain.pkl",
+# custom_retriever_transformers: CustomRetriever = setup_database(
+#     "chroma-db-transformers",
+#     "document_dict_transformers.pkl",
+# )
+# custom_retriever_peft: CustomRetriever = setup_database(
+#     "chroma-db-peft", "document_dict_peft.pkl"
+# )
+# custom_retriever_trl: CustomRetriever = setup_database(
+#     "chroma-db-trl", "document_dict_trl.pkl"
# )
+# custom_retriever_llama_index: CustomRetriever = setup_database(
+#     "chroma-db-llama_index",
+#     "document_dict_llama_index.pkl",
+# )
+# custom_retriever_openai_cookbooks: CustomRetriever = setup_database(
+#     "chroma-db-openai_cookbooks",
+#     "document_dict_openai_cookbooks.pkl",
+# )
+# custom_retriever_langchain: CustomRetriever = setup_database(
+#     "chroma-db-langchain",
+#     "document_dict_langchain.pkl",
+# )
+
+custom_retriever_all_sources: CustomRetriever = setup_database(
+    "chroma-db-all_sources",
+    "document_dict_all_sources.pkl",
 )
 
 # Constants
@@ -92,7 +177,8 @@ AVAILABLE_SOURCES_UI = [
     "LlamaIndex Docs",
     "LangChain Docs",
     "OpenAI Cookbooks",
-    # "Towards AI Blog",
+    "Towards AI Blog",
+    # "All Sources",
     # "RAG Course",
 ]
 
@@ -103,7 +189,8 @@ AVAILABLE_SOURCES = [
     "llama_index",
     "langchain",
     "openai_cookbooks",
-    # "towards_ai_blog",
+    "tai_blog",
+    # "all_sources",
     # "rag_course",
 ]
 
@@ -114,12 +201,13 @@ mongo_db = (
 )
 
 __all__ = [
-    "custom_retriever_transformers",
-    "custom_retriever_peft",
-    "custom_retriever_trl",
-    "custom_retriever_llama_index",
-    "custom_retriever_openai_cookbooks",
-    "custom_retriever_langchain",
+    # "custom_retriever_transformers",
+    # "custom_retriever_peft",
+    # "custom_retriever_trl",
+    # "custom_retriever_llama_index",
+    # "custom_retriever_openai_cookbooks",
+    # "custom_retriever_langchain",
+    "custom_retriever_all_sources",
    "mongo_db",
     "CONCURRENCY_COUNT",
     "AVAILABLE_SOURCES_UI",