"""Sentence-window retrieval demo over local PDF files.

Loads the PDF files found under ``data_path``, builds a sentence-window
vector index and a plain sentence-splitter index from the first loaded
document, then runs one sample query against the sentence-window index and
prints the retrieved window next to the original sentence.
"""
import os

from langchain_community.embeddings.ollama import OllamaEmbeddings
from llama_index.core import Settings, SimpleDirectoryReader, VectorStoreIndex
from llama_index.core.node_parser import SentenceSplitter, SentenceWindowNodeParser
from llama_index.core.postprocessor import MetadataReplacementPostProcessor
from llama_index.llms.ollama import Ollama

data_path = '/data1/home/purui/projects/chatbot/data'


def get_all_files(path, file_type):
    """Collect the paths of all files under ``<path>/<file_type>``.

    Args:
        path: Root data directory.
        file_type: Name of the sub-directory to walk (e.g. ``'pdf'``).

    Returns:
        A list with the path of every file found by walking the
        sub-directory recursively. Empty when the directory does not exist
        or contains no files.
    """
    sub_path = os.path.join(path, file_type)
    return [
        os.path.join(root, file)
        for root, _dirs, files in os.walk(sub_path)
        for file in files
    ]


def load(path, file_type):
    """Load one document per file found under ``<path>/<file_type>``.

    Each document's metadata gets a ``"file_name"`` entry holding the file's
    base name without its final extension.

    Args:
        path: Root data directory.
        file_type: Name of the sub-directory to load from (e.g. ``'pdf'``).

    Returns:
        A list of loaded documents, one per file.
    """
    docs = []
    for file in get_all_files(path, file_type):
        # Strip only the last extension so names such as "report.v2.pdf"
        # keep their full stem ("report.v2") instead of being cut at the
        # first dot.
        file_name = os.path.splitext(os.path.basename(file))[0]
        # NOTE(review): only the first Document returned by the reader is
        # kept; if the reader yields several Documents per file (e.g. one
        # per PDF page), the rest are dropped -- confirm this is intended.
        doc = SimpleDirectoryReader(input_files=[file]).load_data()[0]
        doc.metadata.update({"file_name": file_name})
        docs.append(doc)
    return docs


doc = load(data_path, 'pdf')
print(f"Total file: {len(doc)}")
if not doc:
    # Fail with an explicit message instead of an IndexError on doc[0] below.
    raise FileNotFoundError(f"No 'pdf' files found under {data_path!r}")

# Sentence-window parser: the surrounding window and the original sentence
# are stored in node metadata under the keys configured here.
node_parser = SentenceWindowNodeParser.from_defaults(
    window_size=3,
    window_metadata_key="window",
    original_text_metadata_key="original_sentence",
)
text_splitter = SentenceSplitter()

llm = Ollama(model="llama3")
embed_model = OllamaEmbeddings(model="llama3")

# Register the models and the splitter as the global defaults.
Settings.llm = llm
Settings.embed_model = embed_model
Settings.text_splitter = text_splitter

# Only the first loaded document is parsed and indexed.
nodes = node_parser.get_nodes_from_documents(documents=[doc[0]])
base_nodes = text_splitter.get_nodes_from_documents(documents=[doc[0]])

sentence_index = VectorStoreIndex(nodes)
# Index built from the plain sentence-splitter nodes; not queried below.
base_index = VectorStoreIndex(base_nodes)

query_engine = sentence_index.as_query_engine(
    similarity_top_k=2,
    # the target key defaults to `window` to match the node_parser's default
    node_postprocessors=[
        MetadataReplacementPostProcessor(target_metadata_key="window")
    ],
)

window_response = query_engine.query("Who is Alice?")

# Show the stored window and original sentence of the top source node.
window = window_response.source_nodes[0].node.metadata["window"]
sentence = window_response.source_nodes[0].node.metadata["original_sentence"]

print(f"Window: {window}")
print("------------------")
print(f"Original Sentence: {sentence}")