import nltk
import pandas as pd
import numpy as np
nltk.data.path.append("/content/nltk_data")
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt_tab')

from nltk.tokenize import sent_tokenize





def split_into_sliding_windows(sentences, window_size=6, overlap=3):
    """Split a list of sentences into overlapping chunks using a sliding window.

    Args:
        sentences (list): List of sentences to split into chunks.
        window_size (int): Number of sentences in each chunk. Default is 6.
        overlap (int): Number of overlapping sentences between consecutive chunks. Default is 3.

    Returns:
        list: List of text chunks, where each chunk is a string of concatenated sentences.
    """
    # Validate input parameters
    if window_size <= overlap:
        raise ValueError("window_size must be greater than overlap.")
    if not sentences:
        return []

    chunks = []
    step = window_size - overlap  # How much to move the window each time

    # Iterate over the sentences with the specified step size
    for i in range(0, len(sentences), step):
        chunk = sentences[i:i + window_size]
        if len(chunk) >= overlap:  # Keep only windows with at least `overlap` sentences (drops short tails)
            chunks.append(" ".join(chunk))  # Join sentences into a text block

    return chunks
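

# Illustrative sanity check (not part of the original pipeline): with window_size=5 and
# overlap=2 (the defaults used by EmbedAllDocumentsAndInsert below), each window advances
# by step = window_size - overlap = 3 sentences and shares 2 sentences with its predecessor.
def _demo_sliding_windows():
    demo_sentences = [f"Sentence {n}." for n in range(1, 11)]
    for chunk in split_into_sliding_windows(demo_sentences, window_size=5, overlap=2):
        print(chunk)
    # Expected windows: sentences 1-5, 4-8, and 7-10; the final single-sentence window
    # (sentence 10 alone) is dropped because it is shorter than the overlap.
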

def EmbedAllDocumentsAndInsert(model, extracted_data, collectionInstance, window_size=5, overlap=2):
    """Process documents with a sliding window and insert sentence chunks into Milvus.

    Args:
        model: The embedding model used to generate chunk embeddings.
        extracted_data: Pandas DataFrame containing the extracted documents and metadata.
        collectionInstance: Milvus collection instance to insert data into.
        window_size: Number of sentences in each chunk.
        overlap: Number of overlapping sentences between consecutive chunks.
    """
    count = 0
    total_docs = len(extracted_data)
    print(f"Total documents: {total_docs}")

    for index, row in extracted_data.iterrows():
        document = row["documents"]  # Extract the document text
        doc_id = row["id"]  # Extract the document ID
        doccontextrel = row["gpt3_context_relevance"]  # Extract context relevance score
        doccontextutil = row["gpt35_utilization"]  # Extract context utilization score
        docadherence = row["gpt3_adherence"]  # Extract adherence score
        datasetname = row["dataset_name"]  # Extract dataset name
        relevance_score = row["relevance_score"]  # Extract relevance score
        utilization_score = row["utilization_score"]  # Extract utilization score
        completeness_score = row["completeness_score"]  # Extract completeness score


        if isinstance(document, list):
            # Flatten the list into a single string
            document = " ".join([str(item) for item in document if isinstance(item, str)])
        elif not isinstance(document, str):
            # If the document is not a string or list, convert it to a string
            document = str(document)

        # Step 1: Tokenize document into sentences
        sentences = sent_tokenize(document) if isinstance(document, str) else document

        # Step 2: Generate overlapping chunks
        chunks = split_into_sliding_windows(sentences, window_size, overlap)

        print(f"Total chunks for document {index}: {len(chunks)}")

        for chunk_index, chunk_text in enumerate(chunks):
            # Step 3: Generate embedding for each chunk
            chunk_vector = np.array(model.encode(chunk_text), dtype=np.float32).flatten().tolist()

            print(f"chunk_index= {chunk_index}")

            # Step 4: Insert chunk into Milvus as separate columns
            insert_embeddings_into_milvus(
                collectionInstance,
                chunk_vector,
                f"{chunk_index}__{doc_id}",  # Unique ID for chunk
                doc_id,  # Unique ID for doc
                index,
                float(doccontextrel) if pd.notna(doccontextrel) else 0.0,  # Handle NaN values
                float(doccontextutil) if pd.notna(doccontextutil) else 0.0,  # Handle NaN values
                float(docadherence) if pd.notna(docadherence) else 0.0,  # Handle NaN values
                datasetname,  # Dataset name column
                float(relevance_score) if pd.notna(relevance_score) else 0.0,  # Handle NaN values
                float(utilization_score) if pd.notna(utilization_score) else 0.0,  # Handle NaN values
                float(completeness_score) if pd.notna(completeness_score) else 0.0  # Handle NaN values
            )

            count += 1
            if count % 1000 == 0:
                print(f"Uploaded {count} chunks to Milvus.")

def insert_embeddings_into_milvus(collection, embeddings, chunk_doc_id, doc_id, index,
                                  doccontextrel, doccontextutil, docadherence, datasetname,
                                  relevance_score, utilization_score, completeness_score):
    """Insert a chunk embedding into Milvus along with its metadata.

    Args:
        collection: Milvus collection instance.
        embeddings: Embedding vector for the chunk.
        chunk_doc_id: Unique ID for the chunk.
        doc_id: Unique ID for the parent document.
        index: Index of the document in the dataset (used only for logging).
        doccontextrel: GPT-3 context relevance score.
        doccontextutil: GPT-3.5 context utilization score.
        docadherence: GPT-3 adherence score.
        datasetname: Name of the source dataset.
        relevance_score: Relevance score.
        utilization_score: Utilization score.
        completeness_score: Completeness score.
    """
    try:
        print(f"Inserting chunk {chunk_doc_id} doc {doc_id} (index {index})")
        insert_data = [
            [str(chunk_doc_id)],  # Primary key field (document_id)
            [str(doc_id)],  # Document ID field
            [embeddings],  # Vector field (embedding)
            [float(doccontextrel)],  # Context relevance score field (gpt3_context_relevance)
            [float(doccontextutil)],  # Context utilization score field (gpt35_utilization)
            [float(docadherence)],  # Adherence score field (gpt3_adherence)
            [str(datasetname)],  # Dataset name field
            [float(relevance_score)],  # Relevance score field
            [float(utilization_score)],  # Utilization score field
            [float(completeness_score)]  # Completeness score field
        ]
        collection.insert(insert_data)
    except Exception as e:
        print(f"Error inserting chunk {chunk_doc_id} doc {doc_id} (index {index}): {e}")