import nltk
import pandas as pd
import numpy as np
# Use a local NLTK data directory (here, a Colab path) and fetch the tokenizer and WordNet resources.
nltk.data.path.append("/content/nltk_data")
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt_tab')
from nltk.tokenize import sent_tokenize

def split_into_sliding_windows(sentences, window_size=6, overlap=3):
    """Split a list of sentences into overlapping chunks using a sliding window approach.

    Args:
        sentences (list): List of sentences to split into chunks.
        window_size (int): Number of sentences in each chunk. Default is 6.
        overlap (int): Number of overlapping sentences between consecutive chunks. Default is 3.

    Returns:
        list: List of text chunks, where each chunk is a string of concatenated sentences.
    """
    # Validate input parameters
    if window_size <= overlap:
        raise ValueError("window_size must be greater than overlap.")
    if not sentences:
        return []

    chunks = []
    step = window_size - overlap  # How far the window advances each iteration

    # Slide the window over the sentences with the given step size
    for i in range(0, len(sentences), step):
        chunk = sentences[i:i + window_size]
        if len(chunk) >= overlap:  # Skip trailing chunks shorter than the overlap
            chunks.append(" ".join(chunk))  # Join the sentences into one text block
    return chunks
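
# Illustrative example (comment only, not part of the pipeline), using the defaults above:
# with window_size=6 and overlap=3 the window advances 3 sentences at a time, so
# consecutive chunks share 3 sentences.
#   sents = [f"Sentence {n}." for n in range(1, 10)]            # 9 toy sentences
#   split_into_sliding_windows(sents, window_size=6, overlap=3)
#   # -> ["Sentence 1. ... Sentence 6.",
#   #     "Sentence 4. ... Sentence 9.",
#   #     "Sentence 7. Sentence 8. Sentence 9."]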

def EmbedAllDocumentsAndInsert(model, extracted_data, collectionInstance, window_size=5, overlap=2):
    """Process documents with a sliding window approach and insert sentence chunks into Milvus.

    Args:
        model: The embedding model used to generate document embeddings.
        extracted_data: Pandas DataFrame containing the extracted data.
        collectionInstance: Milvus collection instance to insert data into.
        window_size: Number of sentences in each chunk.
        overlap: Number of overlapping sentences between consecutive chunks.
    """
    count = 0
    total_docs = len(extracted_data)
    print(f"Total documents: {total_docs}")

    for index, row in extracted_data.iterrows():
        document = row["documents"]                      # Document text
        doc_id = row["id"]                               # Document ID
        doccontextrel = row["gpt3_context_relevance"]    # Context relevance score
        doccontextutil = row["gpt35_utilization"]        # Context utilization score
        docadherence = row["gpt3_adherence"]             # Adherence score
        datasetname = row["dataset_name"]                # Dataset name
        relevance_score = row["relevance_score"]         # Relevance score
        utilization_score = row["utilization_score"]     # Utilization score
        completeness_score = row["completeness_score"]   # Completeness score

        if isinstance(document, list):
            # Flatten the list into a single string
            document = " ".join([str(item) for item in document if isinstance(item, str)])
        elif not isinstance(document, str):
            # If the document is neither a string nor a list, convert it to a string
            document = str(document)

        # Step 1: Tokenize the document into sentences (document is guaranteed to be a string here)
        sentences = sent_tokenize(document)

        # Step 2: Generate overlapping chunks
        chunks = split_into_sliding_windows(sentences, window_size, overlap)
        print(f"Total chunks for document {index}: {len(chunks)}")

        for chunk_index, chunk_text in enumerate(chunks):
            # Step 3: Generate an embedding for the chunk
            chunk_vector = np.array(model.encode(chunk_text), dtype=np.float32).flatten().tolist()
            print(f"chunk_index= {chunk_index}")

            # Step 4: Insert the chunk into Milvus as separate columns
            insert_embeddings_into_milvus(
                collectionInstance,
                chunk_vector,
                f"{chunk_index}__{doc_id}",  # Unique ID for the chunk
                doc_id,                      # Unique ID for the document
                index,
                float(doccontextrel) if pd.notna(doccontextrel) else 0.0,           # Handle NaN values
                float(doccontextutil) if pd.notna(doccontextutil) else 0.0,         # Handle NaN values
                float(docadherence) if pd.notna(docadherence) else 0.0,             # Handle NaN values
                datasetname,                                                        # Dataset name column
                float(relevance_score) if pd.notna(relevance_score) else 0.0,       # Handle NaN values
                float(utilization_score) if pd.notna(utilization_score) else 0.0,   # Handle NaN values
                float(completeness_score) if pd.notna(completeness_score) else 0.0  # Handle NaN values
            )

            count += 1
            if count % 1000 == 0:
                print(f"Uploaded {count} chunks to Milvus.")

def insert_embeddings_into_milvus(collection, embeddings, chunk_doc_id, doc_id, index,
                                  doccontextrel, doccontextutil, docadherence, datasetname,
                                  relevance_score, utilization_score, completeness_score):
    """Insert a chunk embedding into Milvus along with its metadata.

    Args:
        collection: Milvus collection instance.
        embeddings: Embedding vector for the chunk.
        chunk_doc_id: Unique ID for the chunk.
        doc_id: Unique ID for the document.
        index: Index of the document in the dataset.
        doccontextrel: Context relevance score.
        doccontextutil: Context utilization score.
        docadherence: Adherence score.
        datasetname: Name of the dataset.
        relevance_score: Relevance score.
        utilization_score: Utilization score.
        completeness_score: Completeness score.
    """
    try:
        print(f"Inserting chunk {chunk_doc_id} doc {doc_id} (index {index})")
        # Column-based insert: each inner list holds one value per field, in schema order.
        insert_data = [
            [str(chunk_doc_id)],          # Primary key field (chunk document_id)
            [str(doc_id)],                # Document ID field
            [embeddings],                 # Vector field (embedding)
            [float(doccontextrel)],       # Context relevance score field
            [float(doccontextutil)],      # Context utilization score field
            [float(docadherence)],        # Adherence score field
            [str(datasetname)],           # Dataset name field
            [float(relevance_score)],     # Relevance score field
            [float(utilization_score)],   # Utilization score field
            [float(completeness_score)]   # Completeness score field
        ]
        collection.insert(insert_data)
    except Exception as e:
        print(f"Error inserting chunk {chunk_doc_id} doc {doc_id} (index {index}): {e}")