# Required Libraries
import fitz  # PyMuPDF, used for PDF text extraction
import os
import json
import uuid  # used to generate unique point IDs for Qdrant
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from dotenv import load_dotenv
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, PointStruct, VectorParams

# Load environment variables from .env file
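# Expects QDRANT_API_KEY and OPENAI_API_KEY to be defined there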
load_dotenv()

# Initialize Qdrant client
qdrant_api_key = os.getenv("QDRANT_API_KEY")  # Get the Qdrant API key from environment variables
qdrant_client = QdrantClient(
    url="https://9266da83-dbfe-48d6-b2d8-cdf101299284.europe-west3-0.gcp.cloud.qdrant.io",
    api_key=qdrant_api_key,
    timeout=300  # 5-minute timeout for large uploads
)
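# Note: the cluster URL is hardcoded above; only the API key is kept out of source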

# Load the set of already-processed document filenames
def load_processed_docs():
    try:
        with open('processed_docs.json', 'r') as f:
            return set(json.load(f))
    except (FileNotFoundError, json.JSONDecodeError):
        # No record yet (or an unreadable one): start fresh
        return set()

# Persist the set of processed filenames so reruns skip them
def save_processed_docs(processed_docs):
    with open('processed_docs.json', 'w') as f:
        json.dump(list(processed_docs), f)

# Create the Qdrant collection if it doesn't already exist
def create_qdrant_collection(collection_name, vector_size):
    try:
        # get_collection raises if the collection is missing
        qdrant_client.get_collection(collection_name)
        print(f"Collection '{collection_name}' already exists.")
    except Exception:
        # Collection not found, so create it with cosine distance
        qdrant_client.create_collection(
            collection_name=collection_name,
            vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE)
        )
        print(f"Collection '{collection_name}' created successfully.")

# Store embeddings in Qdrant, one point per chunk, tagged with its theme
def store_embeddings_in_qdrant(embedded_chunks, collection_name):
    points = []
    for theme, embeddings in embedded_chunks.items():
        for embedding in embeddings:
            points.append(PointStruct(
                id=str(uuid.uuid4()),  # random UUID for each point
                vector=embedding,
                payload={"theme": theme}  # only the theme is stored, not the chunk text
            ))

    # Upload in batches to keep individual requests small
    batch_size = 100
    for i in range(0, len(points), batch_size):
        batch = points[i:i + batch_size]
        try:
            qdrant_client.upsert(
                collection_name=collection_name,
                points=batch
            )
            print(f"Uploaded batch {i // batch_size + 1} to collection '{collection_name}'.")
        except Exception as e:
            print(f"Error uploading batch {i // batch_size + 1}: {e}")

    print(f"Finished uploading {len(points)} points to collection '{collection_name}'.")

# Step 1: Extract Text from PDFs
def extract_text_from_pdf(pdf_path):
    text = ""
    with fitz.open(pdf_path) as doc:  # context manager closes the document
        for page in doc:
            text += page.get_text()
    return text

# Step 2: Define Themes
themes = [
    "Safe and Effective Systems",
    "Algorithmic Discrimination Protections",
    "Data Privacy",
    "Notice and Explanation",
    "Human Alternatives",
    "Risk Management",
    "Governance",
    "Trustworthiness",
    "Unclassified"
]

# Step 3: Chunk the Text
def chunk_text(text, themes):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunks = text_splitter.split_text(text)
    # "Unclassified" is already in the theme list, so every bucket exists up front
    thematic_chunks = {theme: [] for theme in themes}

    # Naive keyword matching: a chunk goes to the first theme whose name it mentions
    for chunk in chunks:
        for theme in themes:
            if theme != "Unclassified" and theme.lower() in chunk.lower():
                thematic_chunks[theme].append(chunk)
                break
        else:
            thematic_chunks["Unclassified"].append(chunk)
    return thematic_chunks

# Step 4: Embed the Chunks
def embed_chunks(thematic_chunks):
    openai_api_key = os.getenv("OPENAI_API_KEY")
    embeddings = OpenAIEmbeddings(model="text-embedding-3-small", openai_api_key=openai_api_key)
    # Skip empty themes so downstream code never sees an empty embedding list
    embedded_chunks = {
        theme: embeddings.embed_documents(chunks)
        for theme, chunks in thematic_chunks.items()
        if chunks
    }
    return embedded_chunks

# Main execution: process any new PDFs in the resources folder
def main():
    resources_folder = "resources"
    processed_docs = load_processed_docs()
    new_docs_processed = False

    collection_name = "ai_info_collection"

    for filename in os.listdir(resources_folder):
        if filename.endswith(".pdf") and filename not in processed_docs:
            pdf_path = os.path.join(resources_folder, filename)
            text = extract_text_from_pdf(pdf_path)
            thematic_chunks = chunk_text(text, themes)
            embedded_chunks = embed_chunks(thematic_chunks)

            # Guard against PDFs that yield no text (e.g. scanned images)
            if not embedded_chunks:
                print(f"No text extracted from {filename}; skipping.")
                continue

            # Create the collection before the first upload (size taken from the embeddings)
            if not new_docs_processed:
                vector_size = len(next(iter(embedded_chunks.values()))[0])
                create_qdrant_collection(collection_name, vector_size)

            # Store embeddings for this document
            store_embeddings_in_qdrant(embedded_chunks, collection_name)

            processed_docs.add(filename)
            new_docs_processed = True
            print(f"Processed and added embeddings for {filename}")

    if new_docs_processed:
        save_processed_docs(processed_docs)
        print("New documents processed and added to the collection.")
    else:
        print("No new documents to process.")

if __name__ == "__main__":
    main()
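
# Usage: put PDFs in ./resources, define QDRANT_API_KEY and OPENAI_API_KEY in .env,
# and run this script; reruns skip any file recorded in processed_docs.json.
#
# A minimal retrieval sketch against the resulting collection (illustrative only;
# the query text "data privacy" is an arbitrary example, and search() may be
# superseded by query_points() in newer qdrant-client releases):
#
#   embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
#   hits = qdrant_client.search(
#       collection_name="ai_info_collection",
#       query_vector=embeddings.embed_query("data privacy"),
#       limit=5,
#   )
#   for hit in hits:
#       print(hit.score, hit.payload["theme"])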