Update app.py
Browse files
app.py
CHANGED
@@ -585,8 +585,8 @@ if __name__ == '__main__':
|
|
585 |
packed_bytes = msgpack.packb(conversations, use_bin_type=True)
|
586 |
fp.write(packed_bytes)
|
587 |
|
588 |
-
if "
|
589 |
-
st.session_state.
|
590 |
|
591 |
with st.sidebar:
|
592 |
st.divider()
|
@@ -639,7 +639,7 @@ if __name__ == '__main__':
|
|
639 |
)
|
640 |
)
|
641 |
|
642 |
-
st.
|
643 |
|
644 |
st.divider()
|
645 |
|
@@ -647,75 +647,64 @@ if __name__ == '__main__':
|
|
647 |
|
648 |
for uploaded_file in uploaded_files:
|
649 |
|
650 |
-
|
651 |
-
|
652 |
-
|
|
|
|
|
|
|
653 |
|
654 |
-
|
655 |
|
656 |
-
|
657 |
-
|
658 |
|
659 |
-
|
660 |
-
|
661 |
-
|
662 |
-
|
663 |
-
|
664 |
-
|
665 |
-
|
666 |
-
|
667 |
-
|
668 |
-
if (st.session_state.df.empty) or (base_name not in st.session_state.df['document'].tolist()):
|
669 |
-
st.session_state.df = pd.concat(
|
670 |
-
[st.session_state.df, pd.DataFrame(data=d)]
|
671 |
-
)
|
672 |
-
else:
|
673 |
-
idx = st.session_state.df.index[st.session_state.df['document']==base_name].tolist()[0]
|
674 |
-
st.session_state.df.loc[idx] = d
|
675 |
|
676 |
-
|
677 |
-
|
678 |
-
|
679 |
-
|
680 |
-
|
681 |
-
|
682 |
-
|
683 |
-
)
|
684 |
|
685 |
-
|
686 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
687 |
|
688 |
-
|
689 |
-
|
690 |
-
|
691 |
-
|
692 |
|
693 |
-
|
694 |
|
695 |
-
|
696 |
-
|
697 |
-
|
698 |
-
|
699 |
-
|
700 |
-
|
701 |
-
|
702 |
-
|
703 |
-
|
|
|
704 |
)
|
705 |
-
)
|
706 |
|
707 |
-
|
708 |
-
dialog_container = float_dialog(
|
709 |
-
show=st.session_state.dialog_open,
|
710 |
-
width=7,
|
711 |
-
background="#EEE3D3",
|
712 |
-
transition=20,
|
713 |
-
transition_from="top",
|
714 |
-
transition_to="center"
|
715 |
-
)
|
716 |
-
|
717 |
-
with dialog_container:
|
718 |
-
st.subheader("Documents Ingested !")
|
719 |
-
if st.button("Ok", key="ok"):
|
720 |
-
st.session_state.dialog_open = False
|
721 |
-
st.rerun()
|
|
|
585 |
packed_bytes = msgpack.packb(conversations, use_bin_type=True)
|
586 |
fp.write(packed_bytes)
|
587 |
|
588 |
+
if "cached_files" not in st.session_state:
|
589 |
+
st.session_state.cached_files = []
|
590 |
|
591 |
with st.sidebar:
|
592 |
st.divider()
|
|
|
639 |
)
|
640 |
)
|
641 |
|
642 |
+
st.toast('URL Content Ingested !', icon='π')
|
643 |
|
644 |
st.divider()
|
645 |
|
|
|
647 |
|
648 |
for uploaded_file in uploaded_files:
|
649 |
|
650 |
+
if uploaded_file not in st.session_state.cached_files:
|
651 |
+
st.session_state.cached_files.append(uploaded_file)
|
652 |
+
|
653 |
+
file_name = os.path.basename(uploaded_file.name)
|
654 |
+
base_name, ext = os.path.splitext(file_name)
|
655 |
+
print(f'session state : {st.session_state.df.keys}')
|
656 |
|
657 |
+
processing_time = datetime.now().strftime('%d %b %Y, %I:%M %p')
|
658 |
|
659 |
+
full_path = os.path.realpath(uploaded_file.name)
|
660 |
+
file_type = ext.lstrip('.')
|
661 |
|
662 |
+
d = {
|
663 |
+
'icon': icon_to_types[file_type][0],
|
664 |
+
'document': base_name,
|
665 |
+
'type': icon_to_types[file_type][1],
|
666 |
+
'path': full_path,
|
667 |
+
'time': [datetime.strptime(processing_time, '%d %b %Y, %I:%M %p')],
|
668 |
+
'toggle': True
|
669 |
+
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
670 |
|
671 |
+
if (st.session_state.df.empty) or (base_name not in st.session_state.df['document'].tolist()):
|
672 |
+
st.session_state.df = pd.concat(
|
673 |
+
[st.session_state.df, pd.DataFrame(data=d)]
|
674 |
+
)
|
675 |
+
else:
|
676 |
+
idx = st.session_state.df.index[st.session_state.df['document']==base_name].tolist()[0]
|
677 |
+
st.session_state.df.loc[idx] = d
|
|
|
678 |
|
679 |
+
st.session_state.df.to_parquet(
|
680 |
+
os.path.join(
|
681 |
+
data_editor_path,
|
682 |
+
'data_editor.parquet.sz'
|
683 |
+
),
|
684 |
+
compression='snappy',
|
685 |
+
engine='pyarrow'
|
686 |
+
)
|
687 |
+
|
688 |
+
weakDict, tables = ppt_chunk(uploaded_file, nlp)
|
689 |
+
documents = weakDict.all_texts()
|
690 |
|
691 |
+
dense = dense_model.embed_documents(documents)
|
692 |
+
print(f'dense: {type(dense)}')
|
693 |
+
sparse = [SparseVector(indices=s.indices.tolist(), values=s.values.tolist()) for s in sparse_model.embed(documents, 32)]
|
694 |
+
print(f'dense: {type(sparse)}')
|
695 |
|
696 |
+
print(f'LEN: {len(documents)}, {len(weakDict.all_metadatas())}')
|
697 |
|
698 |
+
client.upsert(
|
699 |
+
collection_name=collection_name,
|
700 |
+
points=Batch(
|
701 |
+
ids=weakDict.all_ids(),
|
702 |
+
payloads=[{ 'text': documents[i], 'metadata': metadata } for i, metadata in enumerate(weakDict.all_metadatas())],
|
703 |
+
vectors={
|
704 |
+
'text-dense': dense,
|
705 |
+
'text-sparse': sparse
|
706 |
+
}
|
707 |
+
)
|
708 |
)
|
|
|
709 |
|
710 |
+
st.toast('Document(s) Ingested !', icon='π')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|