Update loaders/common.py
loaders/common.py: +63 -48
@@ -10,8 +10,6 @@ from stats import add_usage
 def process_file(vector_store, file, loader_class, file_suffix, stats_db=None):
     try:
         print("=== Starting file processing ===")
-        print(f"Initial file details - Name: {file.name}, Size: {file.size}")
-
         documents = []
         file_name = file.name
         file_size = file.size
@@ -22,45 +20,42 @@ def process_file(vector_store, file, loader_class, file_suffix, stats_db=None):
 
         dateshort = time.strftime("%Y%m%d")
 
-        #
-        print("=== Document Loading ===")
+        # Load documents
         with tempfile.NamedTemporaryFile(delete=False, suffix=file_suffix) as tmp_file:
             tmp_file.write(file.getvalue())
             tmp_file.flush()
-            print(f"Temporary file created: {tmp_file.name}")
-
             loader = loader_class(tmp_file.name)
             documents = loader.load()
-            print(f"Number of documents after loading: {len(documents)}")
-            print("First document content preview:")
-            if documents:
-                print(documents[0].page_content[:200])
-
             file_sha1 = compute_sha1_from_file(tmp_file.name)
         os.remove(tmp_file.name)
 
-        # Debug splitting
-        print("\n=== Document Splitting ===")
         chunk_size = st.session_state['chunk_size']
        chunk_overlap = st.session_state['chunk_overlap']
-        print(f"Splitting with chunk_size: {chunk_size}, overlap: {chunk_overlap}")
-
         text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+
         documents = text_splitter.split_documents(documents)
-        print(f"Number of documents after splitting: {len(documents)}")
 
-        #
-        print("\n=== Creating Documents with Metadata ===")
+        # Create documents with metadata and validate content
         docs_with_metadata = []
         for i, doc in enumerate(documents):
-
-
-
-            print(doc.page_content
-            continue
+            try:
+                # Validate content is string and not empty
+                if not isinstance(doc.page_content, str):
+                    print(f"Skipping document {i}: Invalid content type {type(doc.page_content)}")
+                    continue
 
+                if not doc.page_content.strip():
+                    print(f"Skipping document {i}: Empty content")
+                    continue
+
+                # Basic content validation
+                content = doc.page_content.strip()
+                if len(content) < 10:  # Skip very short contents
+                    print(f"Skipping document {i}: Content too short ({len(content)} chars)")
+                    continue
+
                 new_doc = Document(
-                    page_content=
+                    page_content=content,
                     metadata={
                         "file_sha1": file_sha1,
                         "file_size": file_size,
@@ -72,34 +67,54 @@ def process_file(vector_store, file, loader_class, file_suffix, stats_db=None):
                     }
                 )
                 docs_with_metadata.append(new_doc)
-
-        print(f"
-
+            except Exception as e:
+                print(f"Error processing document {i}: {str(e)}")
+                continue
 
-        print(f"
+        print(f"Processed {len(docs_with_metadata)} valid documents")
+
+        # Process in smaller batches
+        BATCH_SIZE = 50
+        for i in range(0, len(docs_with_metadata), BATCH_SIZE):
+            batch = docs_with_metadata[i:i + BATCH_SIZE]
+            try:
+                print(f"Processing batch {i//BATCH_SIZE + 1} of {(len(docs_with_metadata) + BATCH_SIZE - 1)//BATCH_SIZE}")
+                # Debug embedding process
+                texts = [doc.page_content for doc in batch]
+                metadatas = [doc.metadata for doc in batch]
+
+                print(f"Sample text from batch (first 200 chars): {texts[0][:200] if texts else 'No texts'}")
+
+                # Try to get embeddings directly first
+                try:
+                    embeddings = vector_store._embedding.embed_documents(texts)
+                    print(f"Successfully generated embeddings for batch. First embedding shape: {len(embeddings[0]) if embeddings else 'No embeddings'}")
+                except Exception as e:
+                    print(f"Embedding error: {str(e)}")
+                    print(f"Embedding type: {type(vector_store._embedding).__name__}")
+                    # You might want to add retry logic here
+                    raise
+
+                vector_store.add_documents(batch)
+                print(f"Successfully added batch to vector store")
+
+            except Exception as e:
+                print(f"Error processing batch {i//BATCH_SIZE + 1}: {str(e)}")
+                print(f"First document in failed batch (truncated):")
+                if batch:
+                    print(batch[0].page_content[:200])
+                raise
 
-
-
-
-
-
-
-
-                "chunk_size": chunk_size,
-                "chunk_overlap": chunk_overlap
-            })
-        except Exception as e:
-            print(f"\n=== Vector Store Addition Error ===")
-            print(f"Exception: {str(e)}")
-            print(f"Input details:")
-            print(f"File name: {file_name}")
-            print(f"File size: {file_size}")
-            print(f"File SHA1: {file_sha1}")
-            print(f"Number of documents: {len(docs_with_metadata)}")
-            print(f"Vector store type: {type(vector_store).__name__}")
-            raise
+        if stats_db:
+            add_usage(stats_db, "embedding", "file", metadata={
+                "file_name": file_name,
+                "file_type": file_suffix,
+                "chunk_size": chunk_size,
+                "chunk_overlap": chunk_overlap
+            })
 
     except Exception as e:
         print(f"\n=== General Processing Error ===")
         print(f"Exception occurred during file processing: {str(e)}")
+        print(f"Exception type: {type(e).__name__}")
         raise
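Note on the splitting step kept by this change: `RecursiveCharacterTextSplitter.from_tiktoken_encoder` measures `chunk_size` and `chunk_overlap` in tokens, and those same values are now also reported to `add_usage`. A minimal standalone sketch, assuming `langchain` and `tiktoken` are installed; the document text and the hard-coded sizes below are illustrative stand-ins for the values read from `st.session_state`:

# Illustrative sketch only: chunk_size/chunk_overlap are hard-coded here,
# while process_file reads them from st.session_state.
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

docs = [Document(page_content="word " * 2000, metadata={"source": "demo"})]

splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=500,     # measured in tokens via tiktoken
    chunk_overlap=50,   # tokens shared between consecutive chunks
)
chunks = splitter.split_documents(docs)
print(f"{len(chunks)} chunks, first chunk has {len(chunks[0].page_content)} chars")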
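Note on the new batching loop: `i // BATCH_SIZE + 1` gives the 1-based batch number, and `(len(docs_with_metadata) + BATCH_SIZE - 1) // BATCH_SIZE` is the ceiling of the division, i.e. the total number of batches. A self-contained sketch of that arithmetic, using a stand-in list instead of real `Document` objects:

# Stand-in data: integers instead of Document objects, purely to show the
# batch arithmetic used in the change above.
docs_with_metadata = list(range(120))
BATCH_SIZE = 50

total_batches = (len(docs_with_metadata) + BATCH_SIZE - 1) // BATCH_SIZE  # ceil(120/50) == 3
for i in range(0, len(docs_with_metadata), BATCH_SIZE):
    batch = docs_with_metadata[i:i + BATCH_SIZE]
    print(f"batch {i // BATCH_SIZE + 1} of {total_batches}: {len(batch)} items")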