Update app.py
Browse files
app.py
CHANGED
@@ -10,7 +10,6 @@ import streamlit as st
|
|
10 |
from io import BytesIO
|
11 |
from numpy import ndarray
|
12 |
from llama_cpp import Llama
|
13 |
-
from charset_normalizer import from_bytes
|
14 |
from langchain_core.documents.base import Document
|
15 |
from scipy.sparse import csr_matrix, save_npz, load_npz, vstack
|
16 |
from qdrant_client import QdrantClient, models
|
@@ -407,20 +406,12 @@ if __name__ == '__main__':
|
|
407 |
hi_res_model_name='yolox',
|
408 |
include_page_breaks=True
|
409 |
)
|
410 |
-
|
411 |
-
encoding = str(
|
412 |
-
from_bytes(
|
413 |
-
bytes_content
|
414 |
-
).best()
|
415 |
-
)
|
416 |
-
print(f'Encoding: {encoding}')
|
417 |
-
uploaded_file_name = bytes_content.decode(encoding)
|
418 |
-
print(uploaded_file_name)
|
419 |
|
420 |
texts, metadatas = [], []
|
421 |
for elem in elements:
|
422 |
texts.append(elem.text)
|
423 |
-
metadatas.append(
|
424 |
|
425 |
texts, metadatas, dense_embeddings, sparse_embeddings = chunk_documents(texts, metadatas, dense_model, sparse_model)
|
426 |
|
|
|
10 |
from io import BytesIO
|
11 |
from numpy import ndarray
|
12 |
from llama_cpp import Llama
|
|
|
13 |
from langchain_core.documents.base import Document
|
14 |
from scipy.sparse import csr_matrix, save_npz, load_npz, vstack
|
15 |
from qdrant_client import QdrantClient, models
|
|
|
406 |
hi_res_model_name='yolox',
|
407 |
include_page_breaks=True
|
408 |
)
|
409 |
+
metadata_dict = {"source": uploaded_file.name}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
410 |
|
411 |
texts, metadatas = [], []
|
412 |
for elem in elements:
|
413 |
texts.append(elem.text)
|
414 |
+
metadatas.append(metadata_dict)
|
415 |
|
416 |
texts, metadatas, dense_embeddings, sparse_embeddings = chunk_documents(texts, metadatas, dense_model, sparse_model)
|
417 |
|