import nltk
import pandas as pd
import numpy as np

nltk.data.path.append("/content/nltk_data")
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt_tab')

from nltk.tokenize import sent_tokenize


# Splits a list of sentences into overlapping chunks using a sliding-window approach.
# Args:
#   sentences (list): List of sentences to split into chunks.
#   window_size (int): Number of sentences in each chunk. Default is 6.
#   overlap (int): Number of overlapping sentences between consecutive chunks. Default is 3.
# Returns:
#   list: List of text chunks, where each chunk is a string of concatenated sentences.
def split_into_sliding_windows(sentences, window_size=6, overlap=3):
    # Validate input parameters
    if window_size <= overlap:
        raise ValueError("window_size must be greater than overlap.")
    if not sentences:
        return []

    chunks = []
    step = window_size - overlap  # How far the window advances each iteration

    # Slide the window over the sentences with the specified step size
    for i in range(0, len(sentences), step):
        chunk = sentences[i:i + window_size]
        if len(chunk) >= overlap:  # Skip trailing fragments shorter than the overlap
            chunks.append(" ".join(chunk))  # Join sentences into a single text block
    return chunks


# Processes documents using a sliding-window approach and inserts sentence chunks into Milvus.
# Args:
#   model: The embedding model used to generate document embeddings.
#   extracted_data: Pandas DataFrame containing the extracted data.
#   collectionInstance: Milvus collection instance to insert data into.
#   window_size: Number of sentences in each chunk.
#   overlap: Number of overlapping sentences between consecutive chunks.
def EmbedAllDocumentsAndInsert(model, extracted_data, collectionInstance, window_size=5, overlap=2):
    count = 0
    total_docs = len(extracted_data)
    print(f"Total documents: {total_docs}")

    for index, row in extracted_data.iterrows():
        document = row["documents"]                      # Document text
        doc_id = row["id"]                               # Document ID
        doccontextrel = row["gpt3_context_relevance"]    # Context relevance score
        doccontextutil = row["gpt35_utilization"]        # Context utilization score
        docadherence = row["gpt3_adherence"]             # Adherence score
        datasetname = row["dataset_name"]                # Dataset name
        relevance_score = row["relevance_score"]         # Relevance score
        utilization_score = row["utilization_score"]     # Utilization score
        completeness_score = row["completeness_score"]   # Completeness score

        if isinstance(document, list):
            # Flatten the list into a single string
            document = " ".join([str(item) for item in document if isinstance(item, str)])
        elif not isinstance(document, str):
            # If the document is neither a string nor a list, convert it to a string
            document = str(document)

        # Step 1: Tokenize the document into sentences (document is guaranteed to be a string here)
        sentences = sent_tokenize(document)

        # Step 2: Generate overlapping chunks
        chunks = split_into_sliding_windows(sentences, window_size, overlap)
        print(f"Total chunks for document {index}: {len(chunks)}")

        for chunk_index, chunk_text in enumerate(chunks):
            # Step 3: Generate an embedding for each chunk
            chunk_vector = np.array(model.encode(chunk_text), dtype=np.float32).flatten().tolist()
            print(f"chunk_index= {chunk_index}")

            # Step 4: Insert the chunk into Milvus, one value per collection field
            insert_embeddings_into_milvus(
                collectionInstance,
                chunk_vector,
                f"{chunk_index}__{doc_id}",  # Unique ID for the chunk
                doc_id,                      # Unique ID for the document
                index,
                float(doccontextrel) if pd.notna(doccontextrel) else 0.0,           # Handle NaN values
                float(doccontextutil) if pd.notna(doccontextutil) else 0.0,         # Handle NaN values
                float(docadherence) if pd.notna(docadherence) else 0.0,             # Handle NaN values
                datasetname,                                                        # Dataset name column
                float(relevance_score) if pd.notna(relevance_score) else 0.0,       # Handle NaN values
                float(utilization_score) if pd.notna(utilization_score) else 0.0,   # Handle NaN values
                float(completeness_score) if pd.notna(completeness_score) else 0.0  # Handle NaN values
            )

            count += 1
            if count % 1000 == 0:
                print(f"Uploaded {count} chunks to Milvus.")


# Inserts a chunk embedding into Milvus along with its metadata.
# Args:
#   collection: Milvus collection instance.
#   embeddings: Embedding vector for the chunk.
#   chunk_doc_id: Unique ID for the chunk.
#   doc_id: Unique ID for the document.
#   index: Index of the document in the dataset.
#   doccontextrel: Context relevance score.
#   doccontextutil: Context utilization score.
#   docadherence: Adherence score.
#   datasetname: Name of the dataset.
#   relevance_score: Relevance score.
#   utilization_score: Utilization score.
#   completeness_score: Completeness score.
def insert_embeddings_into_milvus(collection, embeddings, chunk_doc_id, doc_id, index,
                                  doccontextrel, doccontextutil, docadherence, datasetname,
                                  relevance_score, utilization_score, completeness_score):
    try:
        print(f"Inserting chunk {chunk_doc_id} doc {doc_id} (index {index})")
        insert_data = [
            [str(chunk_doc_id)],          # Primary key field (chunk-level document_id)
            [str(doc_id)],                # Document ID field
            [embeddings],                 # Vector field (embedding)
            [float(doccontextrel)],       # Context relevance score field
            [float(doccontextutil)],      # Context utilization score field
            [float(docadherence)],        # Adherence score field
            [str(datasetname)],           # Dataset name field
            [float(relevance_score)],     # Relevance score field
            [float(utilization_score)],   # Utilization score field
            [float(completeness_score)]   # Completeness score field
        ]
        collection.insert(insert_data)
    except Exception as e:
        print(f"Error inserting chunk {chunk_doc_id} doc {doc_id} (index {index}): {e}")
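

# A quick sanity check of the chunking behaviour (illustrative only; the sentences below
# are made up, not from the dataset). With window_size=4 and overlap=2 the window advances
# 2 sentences per step, so consecutive chunks share 2 sentences and the final short
# fragment is kept only if it is at least `overlap` sentences long.
demo_sentences = [f"Sentence {n}." for n in range(1, 8)]
for demo_chunk in split_into_sliding_windows(demo_sentences, window_size=4, overlap=2):
    print(demo_chunk)
# Expected output:
#   Sentence 1. Sentence 2. Sentence 3. Sentence 4.
#   Sentence 3. Sentence 4. Sentence 5. Sentence 6.
#   Sentence 5. Sentence 6. Sentence 7.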
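

# A minimal end-to-end sketch of how the pipeline above might be driven, assuming a
# SentenceTransformers encoder and a Milvus collection whose field order matches
# insert_embeddings_into_milvus. The field names, the 384-dim vector size
# ("all-MiniLM-L6-v2"), the connection details, and the toy extracted_data row are
# assumptions for illustration, not taken from the original pipeline.
from pymilvus import connections, Collection, CollectionSchema, FieldSchema, DataType
from sentence_transformers import SentenceTransformer

# Assumes a Milvus instance is reachable at localhost:19530
connections.connect(alias="default", host="localhost", port="19530")

fields = [
    FieldSchema(name="document_id", dtype=DataType.VARCHAR, max_length=256, is_primary=True),  # chunk-level ID
    FieldSchema(name="doc_id", dtype=DataType.VARCHAR, max_length=256),
    FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=384),
    FieldSchema(name="gpt3_context_relevance", dtype=DataType.FLOAT),
    FieldSchema(name="gpt35_utilization", dtype=DataType.FLOAT),
    FieldSchema(name="gpt3_adherence", dtype=DataType.FLOAT),
    FieldSchema(name="dataset_name", dtype=DataType.VARCHAR, max_length=256),
    FieldSchema(name="relevance_score", dtype=DataType.FLOAT),
    FieldSchema(name="utilization_score", dtype=DataType.FLOAT),
    FieldSchema(name="completeness_score", dtype=DataType.FLOAT),
]
schema = CollectionSchema(fields, description="Sentence-window chunks with RAG quality scores")
collection = Collection(name="document_chunks", schema=schema)

embedding_model = SentenceTransformer("all-MiniLM-L6-v2")  # produces 384-dim embeddings

# Toy single-row DataFrame with the columns EmbedAllDocumentsAndInsert reads; in the real
# pipeline extracted_data comes from the dataset extraction step.
extracted_data = pd.DataFrame([{
    "documents": "First sentence. Second sentence. Third sentence. Fourth sentence. "
                 "Fifth sentence. Sixth sentence.",
    "id": "doc-0",
    "gpt3_context_relevance": 0.8,
    "gpt35_utilization": 0.7,
    "gpt3_adherence": 0.9,
    "dataset_name": "demo",
    "relevance_score": 0.8,
    "utilization_score": 0.7,
    "completeness_score": 0.6,
}])

EmbedAllDocumentsAndInsert(embedding_model, extracted_data, collection, window_size=5, overlap=2)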