import nltk
import pandas as pd
import numpy as np

nltk.data.path.append("/content/nltk_data")
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt_tab')

from nltk.tokenize import sent_tokenize
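# The Punkt models downloaded above back sent_tokenize, which the chunker below relies on.
# A tiny illustration, defined but not called (the sample text is made up for the example):
def example_sentence_tokenize():
    sample = "Milvus stores vectors. Each chunk gets one embedding. Scores ride along as metadata."
    print(sent_tokenize(sample))
    # -> ['Milvus stores vectors.', 'Each chunk gets one embedding.', 'Scores ride along as metadata.']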
def split_into_sliding_windows(sentences, window_size=6, overlap=3):
    """Split a list of sentences into overlapping chunks using a sliding window.

    Args:
        sentences (list): List of sentences to split into chunks.
        window_size (int): Number of sentences in each chunk. Default is 6.
        overlap (int): Number of overlapping sentences between consecutive chunks. Default is 3.

    Returns:
        list: List of text chunks, where each chunk is a string of concatenated sentences.
    """
    # Validate input parameters
    if window_size <= overlap:
        raise ValueError("window_size must be greater than overlap.")
    if not sentences:
        return []

    chunks = []
    step = window_size - overlap  # How far to advance the window each iteration

    # Slide the window over the sentences with the given step size
    for i in range(0, len(sentences), step):
        chunk = sentences[i:i + window_size]
        if len(chunk) >= overlap:  # Skip trailing fragments shorter than the overlap
            chunks.append(" ".join(chunk))  # Join the sentences into one text block
    return chunks
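# A quick illustration of the sliding-window behaviour, defined but not called
# (the sentences and parameter values are made up for the example): with
# window_size=4 and overlap=2 the window advances two sentences at a time,
# so consecutive chunks share two sentences.
def example_sliding_windows():
    sample_sentences = [f"Sentence {i}." for i in range(1, 8)]  # 7 toy sentences
    chunks = split_into_sliding_windows(sample_sentences, window_size=4, overlap=2)
    for chunk in chunks:
        print(chunk)
    # Produces three chunks covering sentences 1-4, 3-6, and 5-7;
    # the final one-sentence fragment is dropped because it is shorter than the overlap.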
def EmbedAllDocumentsAndInsert(model, extracted_data, collectionInstance, window_size=5, overlap=2):
    """Process documents with a sliding window and insert sentence chunks into Milvus.

    Args:
        model: The embedding model used to generate document embeddings.
        extracted_data: Pandas DataFrame containing the extracted data.
        collectionInstance: Milvus collection instance to insert data into.
        window_size: Number of sentences in each chunk.
        overlap: Number of overlapping sentences between consecutive chunks.
    """
    count = 0
    total_docs = len(extracted_data)
    print(f"Total documents: {total_docs}")

    for index, row in extracted_data.iterrows():
        document = row["documents"]                      # Document text
        doc_id = row["id"]                               # Document ID
        doccontextrel = row["gpt3_context_relevance"]    # Context relevance score
        doccontextutil = row["gpt35_utilization"]        # Context utilization score
        docadherence = row["gpt3_adherence"]             # Adherence score
        datasetname = row["dataset_name"]                # Dataset name
        relevance_score = row["relevance_score"]         # Relevance score
        utilization_score = row["utilization_score"]     # Utilization score
        completeness_score = row["completeness_score"]   # Completeness score

        if isinstance(document, list):
            # Flatten the list into a single string
            document = " ".join([str(item) for item in document if isinstance(item, str)])
        elif not isinstance(document, str):
            # If the document is neither a string nor a list, convert it to a string
            document = str(document)

        # Step 1: Tokenize the document into sentences (document is guaranteed to be a string here)
        sentences = sent_tokenize(document)

        # Step 2: Generate overlapping chunks
        chunks = split_into_sliding_windows(sentences, window_size, overlap)
        print(f"Total chunks for document {index}: {len(chunks)}")

        for chunk_index, chunk_text in enumerate(chunks):
            # Step 3: Generate an embedding for the chunk
            chunk_vector = np.array(model.encode(chunk_text), dtype=np.float32).flatten().tolist()
            print(f"chunk_index= {chunk_index}")

            # Step 4: Insert the chunk into Milvus, one value per collection field
            insert_embeddings_into_milvus(
                collectionInstance,
                chunk_vector,
                f"{chunk_index}__{doc_id}",  # Unique ID for the chunk
                doc_id,                      # Unique ID for the document
                index,
                float(doccontextrel) if pd.notna(doccontextrel) else 0.0,           # Handle NaN values
                float(doccontextutil) if pd.notna(doccontextutil) else 0.0,         # Handle NaN values
                float(docadherence) if pd.notna(docadherence) else 0.0,             # Handle NaN values
                datasetname,                                                        # Dataset name column
                float(relevance_score) if pd.notna(relevance_score) else 0.0,       # Handle NaN values
                float(utilization_score) if pd.notna(utilization_score) else 0.0,   # Handle NaN values
                float(completeness_score) if pd.notna(completeness_score) else 0.0  # Handle NaN values
            )

            count += 1
            if count % 1000 == 0:
                print(f"Uploaded {count} chunks to Milvus.")
def insert_embeddings_into_milvus(collection, embeddings, chunk_doc_id, doc_id, index,
                                  doccontextrel, doccontextutil, docadherence, datasetname,
                                  relevance_score, utilization_score, completeness_score):
    """Insert one chunk embedding and its metadata into Milvus.

    Args:
        collection: Milvus collection instance.
        embeddings: Embedding vector for the chunk.
        chunk_doc_id: Unique ID for the chunk.
        doc_id: Unique ID for the document.
        index: Index of the document in the dataset.
        doccontextrel: Context relevance score.
        doccontextutil: Context utilization score.
        docadherence: Adherence score.
        datasetname: Name of the dataset.
        relevance_score: Relevance score.
        utilization_score: Utilization score.
        completeness_score: Completeness score.
    """
    try:
        print(f"Inserting chunk {chunk_doc_id} doc {doc_id} (index {index})")
        # Column-ordered insert: each inner list holds the values for one collection field
        insert_data = [
            [str(chunk_doc_id)],          # Primary key field (chunk-level document ID)
            [str(doc_id)],                # Document ID field
            [embeddings],                 # Vector field (embedding)
            [float(doccontextrel)],       # Context relevance score field
            [float(doccontextutil)],      # Context utilization score field
            [float(docadherence)],        # Adherence score field
            [str(datasetname)],           # Dataset name field
            [float(relevance_score)],     # Relevance score field
            [float(utilization_score)],   # Utilization score field
            [float(completeness_score)]   # Completeness score field
        ]
        collection.insert(insert_data)
    except Exception as e:
        print(f"Error inserting chunk {chunk_doc_id} doc {doc_id} (index {index}): {e}")