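"""Ingest PDFs from ./resources, chunk the text by theme, embed the chunks with
a fine-tuned SentenceTransformer, and upsert the vectors into a Qdrant
collection. Already-processed filenames are tracked in processed_docs.json."""
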
# Required Libraries
from sentence_transformers import SentenceTransformer
import fitz  # PyMuPDF
import os
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from dotenv import load_dotenv
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct  # Typed point records for upsert
import uuid  # For generating unique point IDs
import json
from huggingface_hub import login


# Load environment variables from .env file
load_dotenv()
login(token=os.getenv("HF_TOKEN"))  # Authenticate to download the fine-tuned embedding model


# Initialize Qdrant client
qdrant_api_key = os.getenv("QDRANT_API_KEY")  # Get the Qdrant API key from environment variables
qdrant_client = QdrantClient(
    url="https://9266da83-dbfe-48d6-b2d8-cdf101299284.europe-west3-0.gcp.cloud.qdrant.io",
    api_key=qdrant_api_key,
    timeout=300  # 5 minutes timeout
)
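
# Optional sanity check (illustrative, safe to remove): listing the existing
# collections fails fast if the URL or API key is wrong.
# print([c.name for c in qdrant_client.get_collections().collections])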

# Load the set of already-processed document filenames from disk
def load_processed_docs():
    try:
        with open('processed_docs.json', 'r') as f:
            return set(json.load(f))
    except FileNotFoundError:
        return set()

# Persist the set of processed document filenames to disk
def save_processed_docs(processed_docs):
    with open('processed_docs.json', 'w') as f:
        json.dump(list(processed_docs), f)

# Create the Qdrant collection if it doesn't already exist
def create_qdrant_collection(collection_name, vector_size):
    try:
        # Check if the collection already exists
        qdrant_client.get_collection(collection_name)  # Raises if the collection is missing
        print(f"Collection '{collection_name}' already exists.")
    except Exception:
        # If the collection doesn't exist, create it
        qdrant_client.create_collection(
            collection_name=collection_name,
            vectors_config={
                "size": vector_size,
                "distance": "Cosine"
            }
        )
        print(f"Collection '{collection_name}' created successfully.")

# Store embedded chunks in Qdrant, uploading in batches
def store_embeddings_in_qdrant(embedded_chunks, collection_name):
    points = []
    for theme, embeddings in embedded_chunks.items():
        for embedding in embeddings:
            points.append(PointStruct(
                id=str(uuid.uuid4()),  # Generate a UUID for each point
                # Qdrant expects plain float lists; SentenceTransformer returns numpy arrays
                vector=embedding.tolist() if hasattr(embedding, "tolist") else list(embedding),
                # Note: only the theme label is persisted; the chunk text itself is not stored
                payload={"theme": theme},
            ))
    
    # Batch upload points
    batch_size = 100
    for i in range(0, len(points), batch_size):
        batch = points[i:i+batch_size]
        try:
            qdrant_client.upsert(
                collection_name=collection_name,
                points=batch
            )
            print(f"Uploaded batch {i//batch_size + 1} to collection '{collection_name}'.")
        except Exception as e:
            print(f"Error uploading batch {i//batch_size + 1}: {e}")
    
    print(f"Finished uploading {len(points)} points to collection '{collection_name}'.")        

# Step 1: Extract Text from PDFs
def extract_text_from_pdf(pdf_path):
    text = ""
    with fitz.open(pdf_path) as doc:  # Context manager closes the file when done
        for page in doc:
            text += page.get_text()
    return text

# Step 2: Define Themes
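# (The first five appear to mirror the Blueprint for an AI Bill of Rights
# principles; "Unclassified" is the catch-all bucket used by chunk_text below.)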
themes = [
    "Safe and Effective Systems",
    "Algorithmic Discrimination Protections",
    "Data Privacy",
    "Notice and Explanation",
    "Human Alternatives",
    "Risk Management",
    "Governance",
    "Trustworthiness",
    "Unclassified"
]

# Step 3: Chunk the Text
def chunk_text(text, themes):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunks = text_splitter.split_text(text)
    thematic_chunks = {theme: [] for theme in themes}  # themes already includes "Unclassified"

    # Naive assignment: a chunk goes to the first theme whose name appears
    # verbatim (case-insensitively) in its text
    for chunk in chunks:
        theme_found = False
        for theme in themes:
            if theme.lower() in chunk.lower():
                thematic_chunks[theme].append(chunk)
                theme_found = True
                break
        if not theme_found:
            thematic_chunks["Unclassified"].append(chunk)
    return thematic_chunks

# Step 4: Embed the Chunks
def embed_chunks_openai(thematic_chunks):
    openai_api_key = os.getenv("OPENAI_API_KEY")  # Get the API key from environment variables
    embeddings = OpenAIEmbeddings(model="text-embedding-3-small", openai_api_key=openai_api_key)
    embedded_chunks = {theme: embeddings.embed_documents(chunks) for theme, chunks in thematic_chunks.items()}
    return embedded_chunks
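
# Note: text-embedding-3-small produces 1536-dimensional vectors, so the two
# embedding functions are not interchangeable once a collection has been
# created with a fixed vector size.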

def embed_chunks_fine_tuned(thematic_chunks):
    model = SentenceTransformer("svb01/fine-tuned-embedding-model")
    embedded_chunks = {theme: model.encode(chunks) for theme, chunks in thematic_chunks.items()}
    return embedded_chunks
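
# Illustrative retrieval sketch (not part of the original ingestion flow): the
# stored vectors can be queried with the same fine-tuned model so that query
# and document embeddings share one space. `search` is the standard
# qdrant-client lookup; `search_collection` and `top_k` are hypothetical names.
def search_collection(query, collection_name, top_k=5):
    model = SentenceTransformer("svb01/fine-tuned-embedding-model")
    query_vector = model.encode(query).tolist()  # Single string -> 1D vector
    return qdrant_client.search(
        collection_name=collection_name,
        query_vector=query_vector,
        limit=top_k,
    )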


# Main execution block: process any new PDFs in the resources folder
def main():
    resources_folder = "resources"
    processed_docs = load_processed_docs()
    new_docs_processed = False

    collection_name = "ai_info_collection"
    
    for filename in os.listdir(resources_folder):
        if filename.endswith(".pdf") and filename not in processed_docs:
            pdf_path = os.path.join(resources_folder, filename)
            text = extract_text_from_pdf(pdf_path)
            thematic_chunks = chunk_text(text, themes)
            embedded_chunks = embed_chunks_fine_tuned(thematic_chunks)

            # Ensure the collection exists
            if not new_docs_processed:
                # Take the dimensionality from the first non-empty theme
                # (the first theme bucket may have no chunks)
                vector_size = len(next(e[0] for e in embedded_chunks.values() if len(e) > 0))
                create_qdrant_collection(collection_name, vector_size)

            # Store embeddings for this document
            store_embeddings_in_qdrant(embedded_chunks, collection_name)
            
            processed_docs.add(filename)
            new_docs_processed = True
            print(f"Processed and added embeddings for {filename}")

    if new_docs_processed:
        save_processed_docs(processed_docs)
        print("New documents processed and added to the collection.")
    else:
        print("No new documents to process.")

if __name__ == "__main__":
    main()