Update loaders/common.py
loaders/common.py: +63 -48
@@ -10,8 +10,6 @@ from stats import add_usage
 def process_file(vector_store, file, loader_class, file_suffix, stats_db=None):
     try:
         print("=== Starting file processing ===")
-        print(f"Initial file details - Name: {file.name}, Size: {file.size}")
-
         documents = []
         file_name = file.name
         file_size = file.size
@@ -22,45 +20,42 @@ def process_file(vector_store, file, loader_class, file_suffix, stats_db=None):
 
         dateshort = time.strftime("%Y%m%d")
 
-        #
-        print("=== Document Loading ===")
+        # Load documents
         with tempfile.NamedTemporaryFile(delete=False, suffix=file_suffix) as tmp_file:
             tmp_file.write(file.getvalue())
             tmp_file.flush()
-            print(f"Temporary file created: {tmp_file.name}")
-
             loader = loader_class(tmp_file.name)
             documents = loader.load()
-            print(f"Number of documents after loading: {len(documents)}")
-            print("First document content preview:")
-            if documents:
-                print(documents[0].page_content[:200])
-
             file_sha1 = compute_sha1_from_file(tmp_file.name)
         os.remove(tmp_file.name)
 
-        # Debug splitting
-        print("\n=== Document Splitting ===")
         chunk_size = st.session_state['chunk_size']
        chunk_overlap = st.session_state['chunk_overlap']
-        print(f"Splitting with chunk_size: {chunk_size}, overlap: {chunk_overlap}")
-
         text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+
         documents = text_splitter.split_documents(documents)
-        print(f"Number of documents after splitting: {len(documents)}")
 
-        #
-        print("\n=== Creating Documents with Metadata ===")
+        # Create documents with metadata and validate content
         docs_with_metadata = []
         for i, doc in enumerate(documents):
-
-
-
-            print(doc.page_content
-            continue
+            try:
+                # Validate content is string and not empty
+                if not isinstance(doc.page_content, str):
+                    print(f"Skipping document {i}: Invalid content type {type(doc.page_content)}")
+                    continue
 
+                if not doc.page_content.strip():
+                    print(f"Skipping document {i}: Empty content")
+                    continue
+
+                # Basic content validation
+                content = doc.page_content.strip()
+                if len(content) < 10:  # Skip very short contents
+                    print(f"Skipping document {i}: Content too short ({len(content)} chars)")
+                    continue
+
                 new_doc = Document(
-                    page_content=
+                    page_content=content,
                     metadata={
                         "file_sha1": file_sha1,
                         "file_size": file_size,
@@ -72,34 +67,54 @@ def process_file(vector_store, file, loader_class, file_suffix, stats_db=None):
                     }
                 )
                 docs_with_metadata.append(new_doc)
-
-        print(f"
-
+            except Exception as e:
+                print(f"Error processing document {i}: {str(e)}")
+                continue
 
-        print(f"
+        print(f"Processed {len(docs_with_metadata)} valid documents")
+
+        # Process in smaller batches
+        BATCH_SIZE = 50
+        for i in range(0, len(docs_with_metadata), BATCH_SIZE):
+            batch = docs_with_metadata[i:i + BATCH_SIZE]
+            try:
+                print(f"Processing batch {i//BATCH_SIZE + 1} of {(len(docs_with_metadata) + BATCH_SIZE - 1)//BATCH_SIZE}")
+                # Debug embedding process
+                texts = [doc.page_content for doc in batch]
+                metadatas = [doc.metadata for doc in batch]
+
+                print(f"Sample text from batch (first 200 chars): {texts[0][:200] if texts else 'No texts'}")
+
+                # Try to get embeddings directly first
+                try:
+                    embeddings = vector_store._embedding.embed_documents(texts)
+                    print(f"Successfully generated embeddings for batch. First embedding shape: {len(embeddings[0]) if embeddings else 'No embeddings'}")
+                except Exception as e:
+                    print(f"Embedding error: {str(e)}")
+                    print(f"Embedding type: {type(vector_store._embedding).__name__}")
+                    # You might want to add retry logic here
+                    raise
+
+                vector_store.add_documents(batch)
+                print(f"Successfully added batch to vector store")
+
+            except Exception as e:
+                print(f"Error processing batch {i//BATCH_SIZE + 1}: {str(e)}")
+                print(f"First document in failed batch (truncated):")
+                if batch:
+                    print(batch[0].page_content[:200])
+                raise
 
-
-
-
-
-
-
-
-                "chunk_size": chunk_size,
-                "chunk_overlap": chunk_overlap
-            })
-        except Exception as e:
-            print(f"\n=== Vector Store Addition Error ===")
-            print(f"Exception: {str(e)}")
-            print(f"Input details:")
-            print(f"File name: {file_name}")
-            print(f"File size: {file_size}")
-            print(f"File SHA1: {file_sha1}")
-            print(f"Number of documents: {len(docs_with_metadata)}")
-            print(f"Vector store type: {type(vector_store).__name__}")
-            raise
+        if stats_db:
+            add_usage(stats_db, "embedding", "file", metadata={
+                "file_name": file_name,
+                "file_type": file_suffix,
+                "chunk_size": chunk_size,
+                "chunk_overlap": chunk_overlap
+            })
 
     except Exception as e:
         print(f"\n=== General Processing Error ===")
         print(f"Exception occurred during file processing: {str(e)}")
+        print(f"Exception type: {type(e).__name__}")
         raise
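Note on the splitting step kept by this change: `RecursiveCharacterTextSplitter.from_tiktoken_encoder` measures `chunk_size` and `chunk_overlap` in tokens, and those same values are now also reported to `add_usage`. A minimal standalone sketch, assuming `langchain` and `tiktoken` are installed; the document text and the hard-coded sizes below are illustrative stand-ins for the values read from `st.session_state`:

# Illustrative sketch only: chunk_size/chunk_overlap are hard-coded here,
# while process_file reads them from st.session_state.
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

docs = [Document(page_content="word " * 2000, metadata={"source": "demo"})]

splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=500,     # measured in tokens via tiktoken
    chunk_overlap=50,   # tokens shared between consecutive chunks
)
chunks = splitter.split_documents(docs)
print(f"{len(chunks)} chunks, first chunk has {len(chunks[0].page_content)} chars")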
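Note on the new batching loop: `i // BATCH_SIZE + 1` gives the 1-based batch number, and `(len(docs_with_metadata) + BATCH_SIZE - 1) // BATCH_SIZE` is the ceiling of the division, i.e. the total number of batches. A self-contained sketch of that arithmetic, using a stand-in list instead of real `Document` objects:

# Stand-in data: integers instead of Document objects, purely to show the
# batch arithmetic used in the change above.
docs_with_metadata = list(range(120))
BATCH_SIZE = 50

total_batches = (len(docs_with_metadata) + BATCH_SIZE - 1) // BATCH_SIZE  # ceil(120/50) == 3
for i in range(0, len(docs_with_metadata), BATCH_SIZE):
    batch = docs_with_metadata[i:i + BATCH_SIZE]
    print(f"batch {i // BATCH_SIZE + 1} of {total_batches}: {len(batch)} items")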