add context to nodes script
Browse files
scripts/contextual_retrieval.py → data/scraping_scripts/add_context_to_nodes.py
RENAMED
@@ -9,8 +9,6 @@ import logfire
|
|
9 |
import tiktoken
|
10 |
from anthropic import AsyncAnthropic
|
11 |
from dotenv import load_dotenv
|
12 |
-
|
13 |
-
# from instructor import AsyncInstructor, Mode, patch
|
14 |
from jinja2 import Template
|
15 |
from llama_index.core import Document
|
16 |
from llama_index.core.ingestion import IngestionPipeline
|
@@ -115,31 +113,6 @@ Answer only with the succinct context and nothing else.
|
|
115 |
return response.context
|
116 |
|
117 |
|
118 |
-
# async def process_chunk(node: TextNode, document_dict: dict) -> TextNode:
|
119 |
-
|
120 |
-
# doc_id: str = node.source_node.node_id # type: ignore
|
121 |
-
# doc: Document = document_dict[doc_id]
|
122 |
-
|
123 |
-
# if doc.metadata["tokens"] > 120_000:
|
124 |
-
# # Tokenize the document text
|
125 |
-
# encoding = tiktoken.encoding_for_model("gpt-4o-mini")
|
126 |
-
# tokens = encoding.encode(doc.text)
|
127 |
-
|
128 |
-
# # Trim to 120,000 tokens
|
129 |
-
# trimmed_tokens = tokens[:120_000]
|
130 |
-
|
131 |
-
# # Decode back to text
|
132 |
-
# trimmed_text = encoding.decode(trimmed_tokens)
|
133 |
-
|
134 |
-
# # Update the document text
|
135 |
-
# doc.text = trimmed_text
|
136 |
-
# doc.metadata["tokens"] = 120_000
|
137 |
-
|
138 |
-
# context: str = await situate_context(doc.text, node.text)
|
139 |
-
# node.text = f"{node.text}\n\n{context}"
|
140 |
-
# return node
|
141 |
-
|
142 |
-
|
143 |
async def process_chunk(node: TextNode, document_dict: dict) -> TextNode:
|
144 |
doc_id: str = node.source_node.node_id # type: ignore
|
145 |
doc: Document = document_dict[doc_id]
|
@@ -228,17 +201,3 @@ async def main():
|
|
228 |
|
229 |
if __name__ == "__main__":
|
230 |
asyncio.run(main())
|
231 |
-
|
232 |
-
|
233 |
-
# Ok so I need to create a new chroma-db-all_sources that embedded (context+chunk)
|
234 |
-
# I need to create an index and instead of from_documents it will be from_nodes
|
235 |
-
|
236 |
-
|
237 |
-
# First I need to create contexts for each chunk. Create a list of tasks (doc + chunk)
|
238 |
-
|
239 |
-
|
240 |
-
# documents = create_docs("data/all_sources_data.jsonl")
|
241 |
-
# pipeline = IngestionPipeline(
|
242 |
-
# transformations=[SentenceSplitter(chunk_size=800, chunk_overlap=0)]
|
243 |
-
# )
|
244 |
-
# all_nodes = pipeline.run(documents=documents, show_progress=True)
|
|
|
9 |
import tiktoken
|
10 |
from anthropic import AsyncAnthropic
|
11 |
from dotenv import load_dotenv
|
|
|
|
|
12 |
from jinja2 import Template
|
13 |
from llama_index.core import Document
|
14 |
from llama_index.core.ingestion import IngestionPipeline
|
|
|
113 |
return response.context
|
114 |
|
115 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
116 |
async def process_chunk(node: TextNode, document_dict: dict) -> TextNode:
|
117 |
doc_id: str = node.source_node.node_id # type: ignore
|
118 |
doc: Document = document_dict[doc_id]
|
|
|
201 |
|
202 |
if __name__ == "__main__":
|
203 |
asyncio.run(main())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|