devve1 committed
Commit f914f00 • 1 Parent(s): 0b0b8b2

Update app.py

Files changed (1)
  1. app.py +54 -65
app.py CHANGED
@@ -585,8 +585,8 @@ if __name__ == '__main__':
         packed_bytes = msgpack.packb(conversations, use_bin_type=True)
         fp.write(packed_bytes)
 
-    if "dialog_open" not in st.session_state:
-        st.session_state.dialog_open = False
+    if "cached_files" not in st.session_state:
+        st.session_state.cached_files = []
 
     with st.sidebar:
         st.divider()
@@ -639,7 +639,7 @@ if __name__ == '__main__':
             )
         )
 
-        st.session_state.dialog_open = True
+        st.toast('URL Content Ingested !', icon='🎉')
 
         st.divider()
 
@@ -647,75 +647,64 @@ if __name__ == '__main__':
 
     for uploaded_file in uploaded_files:
 
-        file_name = os.path.basename(uploaded_file.name)
-        base_name, ext = os.path.splitext(file_name)
-        print(f'session state : {st.session_state.df.keys}')
+        if uploaded_file not in st.session_state.cached_files:
+            st.session_state.cached_files.append(uploaded_file)
+
+            file_name = os.path.basename(uploaded_file.name)
+            base_name, ext = os.path.splitext(file_name)
+            print(f'session state : {st.session_state.df.keys}')
 
-        processing_time = datetime.now().strftime('%d %b %Y, %I:%M %p')
+            processing_time = datetime.now().strftime('%d %b %Y, %I:%M %p')
 
-        full_path = os.path.realpath(uploaded_file.name)
-        file_type = ext.lstrip('.')
+            full_path = os.path.realpath(uploaded_file.name)
+            file_type = ext.lstrip('.')
 
-        d = {
-            'icon': icon_to_types[file_type][0],
-            'document': base_name,
-            'type': icon_to_types[file_type][1],
-            'path': full_path,
-            'time': [datetime.strptime(processing_time, '%d %b %Y, %I:%M %p')],
-            'toggle': True
-        }
-
-        if (st.session_state.df.empty) or (base_name not in st.session_state.df['document'].tolist()):
-            st.session_state.df = pd.concat(
-                [st.session_state.df, pd.DataFrame(data=d)]
-            )
-        else:
-            idx = st.session_state.df.index[st.session_state.df['document']==base_name].tolist()[0]
-            st.session_state.df.loc[idx] = d
+            d = {
+                'icon': icon_to_types[file_type][0],
+                'document': base_name,
+                'type': icon_to_types[file_type][1],
+                'path': full_path,
+                'time': [datetime.strptime(processing_time, '%d %b %Y, %I:%M %p')],
+                'toggle': True
+            }
+
+            if (st.session_state.df.empty) or (base_name not in st.session_state.df['document'].tolist()):
+                st.session_state.df = pd.concat(
+                    [st.session_state.df, pd.DataFrame(data=d)]
+                )
+            else:
+                idx = st.session_state.df.index[st.session_state.df['document']==base_name].tolist()[0]
+                st.session_state.df.loc[idx] = d
 
-        st.session_state.df.to_parquet(
-            os.path.join(
-                data_editor_path,
-                'data_editor.parquet.sz'
-            ),
-            compression='snappy',
-            engine='pyarrow'
-        )
-
-        weakDict, tables = ppt_chunk(uploaded_file, nlp)
-        documents = weakDict.all_texts()
+            st.session_state.df.to_parquet(
+                os.path.join(
+                    data_editor_path,
+                    'data_editor.parquet.sz'
+                ),
+                compression='snappy',
+                engine='pyarrow'
+            )
+
+            weakDict, tables = ppt_chunk(uploaded_file, nlp)
+            documents = weakDict.all_texts()
 
-        dense = dense_model.embed_documents(documents)
-        print(f'dense: {type(dense)}')
-        sparse = [SparseVector(indices=s.indices.tolist(), values=s.values.tolist()) for s in sparse_model.embed(documents, 32)]
-        print(f'dense: {type(sparse)}')
+            dense = dense_model.embed_documents(documents)
+            print(f'dense: {type(dense)}')
+            sparse = [SparseVector(indices=s.indices.tolist(), values=s.values.tolist()) for s in sparse_model.embed(documents, 32)]
+            print(f'dense: {type(sparse)}')
 
-        print(f'LEN: {len(documents)}, {len(weakDict.all_metadatas())}')
+            print(f'LEN: {len(documents)}, {len(weakDict.all_metadatas())}')
 
-        client.upsert(
-            collection_name=collection_name,
-            points=Batch(
-                ids=weakDict.all_ids(),
-                payloads=[{ 'text': documents[i], 'metadata': metadata } for i, metadata in enumerate(weakDict.all_metadatas())],
-                vectors={
-                    'text-dense': dense,
-                    'text-sparse': sparse
-                }
-            )
-        )
+            client.upsert(
+                collection_name=collection_name,
+                points=Batch(
+                    ids=weakDict.all_ids(),
+                    payloads=[{ 'text': documents[i], 'metadata': metadata } for i, metadata in enumerate(weakDict.all_metadatas())],
+                    vectors={
+                        'text-dense': dense,
+                        'text-sparse': sparse
+                    }
+                )
+            )
 
-        st.session_state.dialog_open = True
-        dialog_container = float_dialog(
-            show=st.session_state.dialog_open,
-            width=7,
-            background="#EEE3D3",
-            transition=20,
-            transition_from="top",
-            transition_to="center"
-        )
-
-        with dialog_container:
-            st.subheader("Documents Ingested !")
-            if st.button("Ok", key="ok"):
-                st.session_state.dialog_open = False
-                st.rerun()
+            st.toast('Document(s) Ingested !', icon='🎉')
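
The "cached_files" guard introduced in this commit is the usual Streamlit pattern for running an expensive ingestion step only once per uploaded file across script reruns. A minimal standalone sketch of that pattern follows; the uploader label and the placeholder ingest() function are assumptions for illustration, not part of app.py.

import streamlit as st

if "cached_files" not in st.session_state:
    st.session_state.cached_files = []

# Assumption: a multiple-file uploader, as app.py appears to use.
uploaded_files = st.file_uploader("Upload documents", accept_multiple_files=True)

def ingest(file):
    # Placeholder for the real chunk / embed / upsert pipeline in app.py.
    pass

for uploaded_file in uploaded_files or []:
    # Streamlit reruns the whole script on every interaction, so skip files
    # that this session has already processed.
    if uploaded_file not in st.session_state.cached_files:
        st.session_state.cached_files.append(uploaded_file)
        ingest(uploaded_file)
        st.toast('Document(s) Ingested !', icon='🎉')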
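For reference, the dense + sparse upsert in the hunk above follows Qdrant's named-vector Batch API. Below is a minimal self-contained sketch against an in-memory client; the collection name, vector size, and toy embeddings are assumptions standing in for app.py's dense_model / sparse_model output.

from qdrant_client import QdrantClient
from qdrant_client.models import (
    Batch, Distance, SparseVector, SparseVectorParams, VectorParams
)

# In-memory instance for illustration only; app.py uses its own client.
client = QdrantClient(":memory:")
collection_name = "documents"  # assumption

# One named dense vector and one named sparse vector per point.
client.create_collection(
    collection_name=collection_name,
    vectors_config={"text-dense": VectorParams(size=4, distance=Distance.COSINE)},
    sparse_vectors_config={"text-sparse": SparseVectorParams()},
)

documents = ["slide one text", "slide two text"]
metadatas = [{"page": 1}, {"page": 2}]

# Toy embeddings in place of dense_model.embed_documents / sparse_model.embed.
dense = [[0.1, 0.2, 0.3, 0.4], [0.4, 0.3, 0.2, 0.1]]
sparse = [
    SparseVector(indices=[1, 5], values=[0.7, 0.2]),
    SparseVector(indices=[2, 8], values=[0.5, 0.4]),
]

client.upsert(
    collection_name=collection_name,
    points=Batch(
        ids=[1, 2],
        payloads=[{"text": documents[i], "metadata": m} for i, m in enumerate(metadatas)],
        vectors={"text-dense": dense, "text-sparse": sparse},
    ),
)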