omarsol committed on
Commit
0591920
·
1 Parent(s): cd34252

add context to nodes script

Browse files
scripts/contextual_retrieval.py → data/scraping_scripts/add_context_to_nodes.py RENAMED
@@ -9,8 +9,6 @@ import logfire
9
  import tiktoken
10
  from anthropic import AsyncAnthropic
11
  from dotenv import load_dotenv
12
-
13
- # from instructor import AsyncInstructor, Mode, patch
14
  from jinja2 import Template
15
  from llama_index.core import Document
16
  from llama_index.core.ingestion import IngestionPipeline
@@ -115,31 +113,6 @@ Answer only with the succinct context and nothing else.
115
  return response.context
116
 
117
 
118
- # async def process_chunk(node: TextNode, document_dict: dict) -> TextNode:
119
-
120
- # doc_id: str = node.source_node.node_id # type: ignore
121
- # doc: Document = document_dict[doc_id]
122
-
123
- # if doc.metadata["tokens"] > 120_000:
124
- # # Tokenize the document text
125
- # encoding = tiktoken.encoding_for_model("gpt-4o-mini")
126
- # tokens = encoding.encode(doc.text)
127
-
128
- # # Trim to 120,000 tokens
129
- # trimmed_tokens = tokens[:120_000]
130
-
131
- # # Decode back to text
132
- # trimmed_text = encoding.decode(trimmed_tokens)
133
-
134
- # # Update the document text
135
- # doc.text = trimmed_text
136
- # doc.metadata["tokens"] = 120_000
137
-
138
- # context: str = await situate_context(doc.text, node.text)
139
- # node.text = f"{node.text}\n\n{context}"
140
- # return node
141
-
142
-
143
  async def process_chunk(node: TextNode, document_dict: dict) -> TextNode:
144
  doc_id: str = node.source_node.node_id # type: ignore
145
  doc: Document = document_dict[doc_id]
@@ -228,17 +201,3 @@ async def main():
228
 
229
  if __name__ == "__main__":
230
  asyncio.run(main())
231
-
232
-
233
- # Ok so I need to create a new chroma-db-all_sources that embedded (context+chunk)
234
- # I need to create an index and instead of from_documents it will be from_nodes
235
-
236
-
237
- # First I need to create contexts for each chunk. Create a list of tasks (doc + chunk)
238
-
239
-
240
- # documents = create_docs("data/all_sources_data.jsonl")
241
- # pipeline = IngestionPipeline(
242
- # transformations=[SentenceSplitter(chunk_size=800, chunk_overlap=0)]
243
- # )
244
- # all_nodes = pipeline.run(documents=documents, show_progress=True)
 
9
  import tiktoken
10
  from anthropic import AsyncAnthropic
11
  from dotenv import load_dotenv
 
 
12
  from jinja2 import Template
13
  from llama_index.core import Document
14
  from llama_index.core.ingestion import IngestionPipeline
 
113
  return response.context
114
 
115
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
116
  async def process_chunk(node: TextNode, document_dict: dict) -> TextNode:
117
  doc_id: str = node.source_node.node_id # type: ignore
118
  doc: Document = document_dict[doc_id]
 
201
 
202
  if __name__ == "__main__":
203
  asyncio.run(main())