Update app.py
Browse files
app.py
CHANGED
@@ -648,77 +648,71 @@ if __name__ == '__main__':
|
|
648 |
|
649 |
uploaded_files = st.file_uploader("Upload a file :", accept_multiple_files=True, type=['pptx', 'ppt'])
|
650 |
print(f'uploaded-files : {uploaded_files}')
|
651 |
-
|
652 |
-
|
653 |
-
|
654 |
-
|
655 |
-
|
656 |
-
|
657 |
-
full_path = os.path.realpath(uploaded_file.name)
|
658 |
-
file_type = ext.lstrip('.')
|
659 |
-
|
660 |
-
d = {
|
661 |
-
'icon': icon_to_types[file_type][0],
|
662 |
-
'document': base_name,
|
663 |
-
'type': icon_to_types[file_type][1],
|
664 |
-
'path': full_path,
|
665 |
-
'time': [datetime.strptime(processing_time, '%d %b %Y, %I:%M %p')],
|
666 |
-
'toggle': True
|
667 |
-
}
|
668 |
-
|
669 |
-
st.session_state.df = pd.concat(
|
670 |
-
[st.session_state.df, pd.DataFrame(data=d)]
|
671 |
-
)
|
672 |
-
st.session_state.df.to_parquet(
|
673 |
-
os.path.join(
|
674 |
-
data_editor_path,
|
675 |
-
'data_editor.parquet.br'
|
676 |
-
),
|
677 |
-
compression='brotli',
|
678 |
-
engine='pyarrow'
|
679 |
-
)
|
680 |
-
|
681 |
-
elements = partition_pptx(file=uploaded_file)
|
682 |
-
|
683 |
-
for elem in elements:
|
684 |
-
elem.text = clean(elem.text, bullets=True)
|
685 |
-
text_type = elem.to_dict()['type']
|
686 |
-
print(f'UNSTRUCTURED TEXT: {text_type} , {elem.text}')
|
687 |
|
688 |
-
|
689 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
690 |
|
691 |
-
|
692 |
-
|
693 |
-
|
694 |
-
|
695 |
|
696 |
-
|
697 |
|
698 |
-
|
699 |
-
|
700 |
-
|
701 |
-
|
702 |
-
|
703 |
-
|
704 |
-
|
705 |
-
|
706 |
-
|
|
|
707 |
)
|
708 |
-
)
|
709 |
|
710 |
-
|
711 |
-
|
712 |
-
|
713 |
-
|
714 |
-
|
715 |
-
|
716 |
-
|
717 |
-
|
718 |
-
|
719 |
-
|
720 |
-
|
721 |
-
|
722 |
-
|
723 |
-
|
724 |
-
|
|
|
648 |
|
649 |
# Upload -> catalogue -> chunk/embed -> index pipeline for PowerPoint decks.
uploaded_files = st.file_uploader("Upload a file :", accept_multiple_files=True, type=['pptx', 'ppt'])
print(f'uploaded-files : {uploaded_files}')

# Streamlit re-executes this script on every widget interaction while the
# uploader keeps returning the same files. Remember what was already ingested
# so a rerun (e.g. clicking "Ok" below) does not append duplicate rows,
# re-embed every deck, or re-open the confirmation dialog.
if 'ingested_uploads' not in st.session_state:
    st.session_state.ingested_uploads = set()
if 'dialog_open' not in st.session_state:
    st.session_state.dialog_open = False

# With accept_multiple_files=True the uploader yields an empty list until the
# user uploads something, so never index it blindly; removed files simply
# disappear from the list. `uploaded_files or []` also tolerates None.
pending_uploads = []
for candidate in uploaded_files or []:
    # Prefer Streamlit's per-upload id when the installed version exposes
    # one; otherwise fall back to (name, size).
    upload_key = getattr(candidate, 'file_id', None) or (candidate.name, candidate.size)
    if upload_key not in st.session_state.ingested_uploads:
        pending_uploads.append((upload_key, candidate))

for upload_key, uploaded_file in pending_uploads:

    # Round-trip through the display format: the stored timestamp is
    # deliberately truncated to minute precision.
    processing_time = datetime.now().strftime('%d %b %Y, %I:%M %p')
    file_name = os.path.basename(uploaded_file.name)
    base_name, ext = os.path.splitext(file_name)

    # NOTE(review): uploads are in-memory buffers, so this resolves the bare
    # file name against the current working directory and need not point at
    # a real file -- confirm what consumers of 'path' expect.
    full_path = os.path.realpath(uploaded_file.name)
    # Lower-case so 'Deck.PPTX' resolves to the same entry as 'deck.pptx'
    # (presumably icon_to_types is keyed by lower-case extension -- verify).
    file_type = ext.lstrip('.').lower()

    # One-row record for the document table; the single-element list under
    # 'time' is what lets pandas broadcast the scalar fields into one row.
    d = {
        'icon': icon_to_types[file_type][0],
        'document': base_name,
        'type': icon_to_types[file_type][1],
        'path': full_path,
        'time': [datetime.strptime(processing_time, '%d %b %Y, %I:%M %p')],
        'toggle': True
    }

    # Append the row and persist the whole table right away.
    st.session_state.df = pd.concat(
        [st.session_state.df, pd.DataFrame(data=d)]
    )
    st.session_state.df.to_parquet(
        os.path.join(
            data_editor_path,
            'data_editor.parquet.br'
        ),
        compression='brotli',
        engine='pyarrow'
    )

    # Split the deck into text chunks. `tables` is not used in this section.
    weakDict, tables = ppt_chunk(uploaded_file, nlp)
    documents = weakDict.all_texts()
    metadatas = weakDict.all_metadatas()

    dense = dense_model.embed_documents(documents)
    print(f'dense: {type(dense)}')
    # 32 is presumably the embedding batch size -- confirm against sparse_model.
    sparse = [SparseVector(indices=s.indices.tolist(), values=s.values.tolist()) for s in sparse_model.embed(documents, 32)]
    print(f'sparse: {type(sparse)}')

    print(f'LEN: {len(documents)}, {len(metadatas)}')

    # Store every chunk with both its dense and sparse vector.
    client.upsert(
        collection_name=collection_name,
        points=Batch(
            ids=weakDict.all_ids(),
            payloads=[{'text': text, 'metadata': metadata} for text, metadata in zip(documents, metadatas)],
            vectors={
                'text-dense': dense,
                'text-sparse': sparse
            }
        )
    )

    # Mark as done only after the upsert succeeded so a failed upload is
    # retried on the next run.
    st.session_state.ingested_uploads.add(upload_key)

# Open the confirmation only when this run actually ingested something;
# otherwise the "Ok" handler's reset would be overwritten on its own rerun.
if pending_uploads:
    st.session_state.dialog_open = True

if st.session_state.dialog_open:
    dialog_container = float_dialog(
        show=st.session_state.dialog_open,
        width=7,
        background="#EEE3D3",
        transition=7,
        transition_from="top",
        transition_to="center"
    )

    with dialog_container:
        st.subheader("Documents Ingested !")
        if st.button("Ok", key="ok"):
            st.session_state.dialog_open = False
            st.rerun()
|