"""General node utils."""

import logging
from typing import List

from gpt_index.data_structs.data_structs import Node
from gpt_index.langchain_helpers.text_splitter import (
    TextSplit,
    TextSplitter,
    TokenTextSplitter,
)
from gpt_index.schema import BaseDocument
from gpt_index.utils import truncate_text

logger = logging.getLogger(__name__)


def get_text_splits_from_document(
    document: BaseDocument,
    text_splitter: TextSplitter,
    include_extra_info: bool = True,
) -> List[TextSplit]:
    """Break the document into chunks with additional info."""
    # TODO: clean up; this only exists due to the difference with LangChain's
    # TextSplitter interface
    if isinstance(text_splitter, TokenTextSplitter):
        # TokenTextSplitter can also report per-chunk overlap information
        text_splits = text_splitter.split_text_with_overlaps(
            document.get_text(),
            extra_info_str=document.extra_info_str if include_extra_info else None,
        )
    else:
        text_chunks = text_splitter.split_text(
            document.get_text(),
        )
        text_splits = [TextSplit(text_chunk=text_chunk) for text_chunk in text_chunks]
    return text_splits
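
# Example (illustrative only, not part of the original module): with a
# TokenTextSplitter, each returned TextSplit also carries num_char_overlap,
# which get_nodes_from_document below uses to compute character positions;
# other splitters yield TextSplits with only text_chunk set. The splitter
# settings here are arbitrary placeholders:
#
#   splitter = TokenTextSplitter(chunk_size=256, chunk_overlap=20)
#   splits = get_text_splits_from_document(document, splitter)
#   splits[0].text_chunk        # first chunk of the document's text
#   splits[1].num_char_overlap  # characters shared with the previous chunk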


def get_nodes_from_document(
    document: BaseDocument,
    text_splitter: TextSplitter,
    start_idx: int = 0,
    include_extra_info: bool = True,
) -> List[Node]:
    """Split a document into a list of nodes."""
    text_splits = get_text_splits_from_document(
        document=document,
        text_splitter=text_splitter,
        include_extra_info=include_extra_info,
    )
    nodes = []
    index_counter = 0
    for i, text_split in enumerate(text_splits):
        text_chunk = text_split.text_chunk
        logger.debug(f"> Adding chunk: {truncate_text(text_chunk, 50)}")
        index_pos_info = None
        if text_split.num_char_overlap is not None:
            index_pos_info = {
                # NOTE: start is inclusive, end is exclusive
                "start": index_counter - text_split.num_char_overlap,
                "end": index_counter - text_split.num_char_overlap + len(text_chunk),
            }
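        # Advance the running character offset past this chunk. The +1 appears
        # to account for a one-character separator between adjacent chunks
        # (e.g. the splitter's default " " separator); this is an assumption,
        # not documented in the original code.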
        index_counter += len(text_chunk) + 1
        # if embedding specified in document, pass it to the Node
        node = Node(
            text=text_chunk,
            index=start_idx + i,
            ref_doc_id=document.get_doc_id(),
            embedding=document.embedding,
            extra_info=document.extra_info if include_extra_info else None,
            node_info=index_pos_info,
        )
        nodes.append(node)
    return nodes
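

if __name__ == "__main__":
    # Minimal usage sketch (illustrative only, not part of the original
    # module). Assumes the Document class from gpt_index.readers.schema.base;
    # the sample text and splitter settings are arbitrary placeholders.
    from gpt_index.readers.schema.base import Document

    logging.basicConfig(level=logging.DEBUG)
    doc = Document("some long text " * 200, extra_info={"source": "demo"})
    splitter = TokenTextSplitter(chunk_size=64, chunk_overlap=8)
    for node in get_nodes_from_document(doc, splitter):
        print(node.index, node.node_info, truncate_text(node.text, 40))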