File size: 6,035 Bytes
35d7369 0d0eac6 35d7369 0d0eac6 35d7369 0d0eac6 35d7369 0d0eac6 35d7369 0d0eac6 35d7369 0d0eac6 35d7369 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 |
# Required Libraries
from sentence_transformers import SentenceTransformer
import fitz # PyMuPDF
import os
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from dotenv import load_dotenv
from qdrant_client import QdrantClient # Import Qdrant client
import uuid # Add this import at the top of your file
import json
from huggingface_hub import login
# Load environment variables from .env file
load_dotenv()
# Authenticate with the Hugging Face Hub so the fine-tuned embedding model
# can be downloaded later.
# NOTE(review): runs at import time; behavior when HF_TOKEN is unset depends
# on huggingface_hub — confirm this should be mandatory.
login(token=os.getenv("HF_TOKEN"))

# Initialize the Qdrant client. The cluster URL can now be overridden via the
# QDRANT_URL environment variable; the previous hard-coded URL remains the
# default, so existing deployments are unaffected.
qdrant_api_key = os.getenv("QDRANT_API_KEY")  # Qdrant API key from environment
qdrant_client = QdrantClient(
    url=os.getenv(
        "QDRANT_URL",
        "https://9266da83-dbfe-48d6-b2d8-cdf101299284.europe-west3-0.gcp.cloud.qdrant.io",
    ),
    api_key=qdrant_api_key,
    timeout=300,  # 5 minutes: large batch upserts over the network can be slow
)
# New function to load processed documents
def load_processed_docs():
    """Return the set of already-processed PDF filenames.

    Reads 'processed_docs.json' from the working directory. A missing or
    corrupt tracking file is treated as "nothing processed yet" rather than
    crashing, so a damaged file only causes re-processing, never a failure.
    """
    try:
        with open('processed_docs.json', 'r') as f:
            return set(json.load(f))
    except (FileNotFoundError, json.JSONDecodeError):
        return set()
# New function to save processed documents
def save_processed_docs(processed_docs):
    """Persist the set of processed PDF filenames to 'processed_docs.json'.

    The names are written sorted so the file content is deterministic and
    diff-friendly (set iteration order is arbitrary).
    """
    with open('processed_docs.json', 'w') as f:
        json.dump(sorted(processed_docs), f)
# Modified function to create the Qdrant collection if it doesn't exist
def create_qdrant_collection(collection_name, vector_size):
    """Ensure a Qdrant collection with cosine distance exists.

    Args:
        collection_name: Name of the collection to create if missing.
        vector_size: Dimensionality of the vectors the collection will store.
    """
    try:
        # get_collection raises when the collection does not exist.
        qdrant_client.get_collection(collection_name)
        print(f"Collection '{collection_name}' already exists.")
    except Exception:
        # NOTE(review): any failure (including transient network errors) falls
        # through to creation; narrowing this to the client's "not found"
        # exception would be safer — confirm what qdrant_client raises here.
        qdrant_client.create_collection(
            collection_name=collection_name,
            vectors_config={
                "size": vector_size,
                "distance": "Cosine"
            }
        )
        print(f"Collection '{collection_name}' created successfully.")
# Modified function to store embeddings in Qdrant
def store_embeddings_in_qdrant(embedded_chunks, collection_name):
    """Upload theme-tagged embedding vectors to a Qdrant collection in batches.

    Args:
        embedded_chunks: Mapping of theme name -> sequence of embedding
            vectors (plain lists or numpy arrays).
        collection_name: Target Qdrant collection.
    """
    # NOTE(review): only the theme is stored in the payload — the original
    # chunk text is not, so it cannot be recovered from Qdrant at query time.
    # Confirm this is intentional.
    points = [
        {
            "id": str(uuid.uuid4()),  # unique id per point
            # tolist() converts numpy arrays (SentenceTransformer output) to
            # plain lists, which the Qdrant API expects; lists pass through.
            "vector": embedding.tolist() if hasattr(embedding, "tolist") else embedding,
            "payload": {"theme": theme},
        }
        for theme, embeddings in embedded_chunks.items()
        for embedding in embeddings
    ]
    # Upload in batches so each HTTP request stays small.
    batch_size = 100
    for start in range(0, len(points), batch_size):
        batch = points[start:start + batch_size]
        batch_no = start // batch_size + 1
        try:
            qdrant_client.upsert(
                collection_name=collection_name,
                points=batch
            )
            print(f"Uploaded batch {batch_no} to collection '{collection_name}'.")
        except Exception as e:
            print(f"Error uploading batch {batch_no}: {e}")
    print(f"Finished uploading {len(points)} points to collection '{collection_name}'.")
# Step 1: Extract Text from PDFs
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in the PDF at *pdf_path*.

    Uses the document as a context manager so the file handle is closed even
    if text extraction raises (the original leaked the open document).
    """
    with fitz.open(pdf_path) as doc:
        return "".join(page.get_text() for page in doc)
# Step 2: Define Themes
# Classification buckets used by chunk_text(): a chunk is assigned to the
# first theme whose name appears in its text. "Unclassified" is the catch-all
# for chunks that mention none of the other theme names.
themes = [
    "Safe and Effective Systems",
    "Algorithmic Discrimination Protections",
    "Data Privacy",
    "Notice and Explanation",
    "Human Alternatives",
    "Risk Management",
    "Governance",
    "Trustworthiness",
    "Unclassified"
]
# Step 3: Chunk the Text
def chunk_text(text, themes):
    """Split *text* into overlapping chunks and bucket them by theme.

    A chunk goes into the first theme whose name appears (case-insensitively)
    in the chunk's text; chunks matching no theme go to "Unclassified".

    Args:
        text: Full document text.
        themes: Iterable of theme names to match against.

    Returns:
        dict mapping each theme name (plus "Unclassified") to its chunk list.
    """
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunks = text_splitter.split_text(text)
    thematic_chunks = {theme: [] for theme in themes}
    # Guarantee the catch-all bucket even if "Unclassified" is absent from
    # themes (the original unconditionally reset it, dropping nothing here
    # since buckets start empty — setdefault preserves that behavior safely).
    thematic_chunks.setdefault("Unclassified", [])
    for chunk in chunks:
        chunk_lower = chunk.lower()  # hoisted: lower once, not once per theme
        for theme in themes:
            if theme.lower() in chunk_lower:
                thematic_chunks[theme].append(chunk)
                break
        else:
            # No theme name appeared anywhere in the chunk.
            thematic_chunks["Unclassified"].append(chunk)
    return thematic_chunks
# Step 4: Embed the Chunks
def embed_chunks_openai(thematic_chunks):
    """Embed each theme's chunks with OpenAI's text-embedding-3-small model.

    Args:
        thematic_chunks: Mapping of theme name -> list of text chunks.

    Returns:
        dict with the same theme keys, each value the list of embedding
        vectors for that theme's chunks.
    """
    api_key = os.getenv("OPENAI_API_KEY")  # read the API key from the environment
    embedder = OpenAIEmbeddings(model="text-embedding-3-small", openai_api_key=api_key)
    embedded_chunks = {}
    for theme, chunks in thematic_chunks.items():
        embedded_chunks[theme] = embedder.embed_documents(chunks)
    return embedded_chunks
def embed_chunks_fine_tuned(thematic_chunks):
    """Embed each theme's chunks with the fine-tuned SentenceTransformer.

    Args:
        thematic_chunks: Mapping of theme name -> list of text chunks.

    Returns:
        dict with the same theme keys, each value the encoded vectors for
        that theme's chunks.
    """
    encoder = SentenceTransformer("svb01/fine-tuned-embedding-model")
    embedded_chunks = {}
    for theme, chunks in thematic_chunks.items():
        embedded_chunks[theme] = encoder.encode(chunks)
    return embedded_chunks
# The rest of app.py remains the same
# Modified main execution block
def main():
    """Embed any new PDFs in ./resources and upsert them into Qdrant.

    Filenames already handled are tracked in processed_docs.json, so reruns
    only process documents added since the last run.
    """
    resources_folder = "resources"
    processed_docs = load_processed_docs()
    new_docs_processed = False
    collection_ensured = False  # ensure the collection exactly once per run
    collection_name = "ai_info_collection"

    for filename in os.listdir(resources_folder):
        if not filename.endswith(".pdf") or filename in processed_docs:
            continue
        pdf_path = os.path.join(resources_folder, filename)
        text = extract_text_from_pdf(pdf_path)
        thematic_chunks = chunk_text(text, themes)
        embedded_chunks = embed_chunks_fine_tuned(thematic_chunks)

        if not collection_ensured:
            # Bug fix: the original read the first theme bucket's first
            # vector, which raises IndexError whenever that bucket is empty.
            # Take the vector size from the first non-empty bucket instead.
            vector_size = next(
                (len(vectors[0]) for vectors in embedded_chunks.values() if len(vectors) > 0),
                None,
            )
            if vector_size is not None:
                create_qdrant_collection(collection_name, vector_size)
                collection_ensured = True

        store_embeddings_in_qdrant(embedded_chunks, collection_name)
        processed_docs.add(filename)
        new_docs_processed = True
        # Bug fix: the original f-string had no placeholder and printed the
        # literal text "(unknown)" instead of the document name.
        print(f"Processed and added embeddings for {filename}")

    if new_docs_processed:
        save_processed_docs(processed_docs)
        print("New documents processed and added to the collection.")
    else:
        print("No new documents to process.")

if __name__ == "__main__":
    main()