"""General node utils."""
import logging
from typing import List
from gpt_index.data_structs.data_structs import Node
from gpt_index.langchain_helpers.text_splitter import (
TextSplit,
TextSplitter,
TokenTextSplitter,
)
from gpt_index.schema import BaseDocument
from gpt_index.utils import truncate_text
def get_text_splits_from_document(
    document: BaseDocument,
    text_splitter: TextSplitter,
    include_extra_info: bool = True,
) -> List[TextSplit]:
    """Break the document into chunks with additional info.

    Args:
        document: source document whose text is chunked.
        text_splitter: splitter used to divide the document text.
        include_extra_info: if True, pass the document's extra-info string
            to the splitter (only supported by TokenTextSplitter).

    Returns:
        List of TextSplit objects, one per chunk.
    """
    # TODO: clean up since this only exists due to the diff w LangChain's TextSplitter
    if isinstance(text_splitter, TokenTextSplitter):
        # TokenTextSplitter can report per-chunk overlap info and can
        # weave the document's extra info into each chunk.
        return text_splitter.split_text_with_overlaps(
            document.get_text(),
            extra_info_str=document.extra_info_str if include_extra_info else None,
        )
    # Generic splitters return plain strings; wrap each in a TextSplit
    # (no overlap information available in this path).
    text_chunks = text_splitter.split_text(document.get_text())
    return [TextSplit(text_chunk=text_chunk) for text_chunk in text_chunks]
def get_nodes_from_document(
    document: BaseDocument,
    text_splitter: TextSplitter,
    start_idx: int = 0,
    include_extra_info: bool = True,
) -> List[Node]:
    """Add document to index.

    Splits *document* into chunks and wraps each chunk in a Node carrying
    the document's id, embedding, extra info, and (when the splitter
    reports overlap) the chunk's character span within the document.
    """
    splits = get_text_splits_from_document(
        document=document,
        text_splitter=text_splitter,
        include_extra_info=include_extra_info,
    )
    nodes: List[Node] = []
    char_cursor = 0
    for offset, split in enumerate(splits):
        chunk = split.text_chunk
        logging.debug(f"> Adding chunk: {truncate_text(chunk, 50)}")
        if split.num_char_overlap is None:
            # No overlap info available -> no positional metadata.
            position_info = None
        else:
            span_start = char_cursor - split.num_char_overlap
            # NOTE: start is inclusive, end is exclusive
            position_info = {
                "start": span_start,
                "end": span_start + len(chunk),
            }
        char_cursor += len(chunk) + 1
        # if embedding specified in document, pass it to the Node
        nodes.append(
            Node(
                text=chunk,
                index=start_idx + offset,
                ref_doc_id=document.get_doc_id(),
                embedding=document.embedding,
                extra_info=document.extra_info if include_extra_info else None,
                node_info=position_info,
            )
        )
    return nodes