devve1 committed on
Commit
1014077
1 Parent(s): f23fe85

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +62 -68
app.py CHANGED
@@ -648,77 +648,71 @@ if __name__ == '__main__':
648
 
649
  uploaded_files = st.file_uploader("Upload a file :", accept_multiple_files=True, type=['pptx', 'ppt'])
650
  print(f'uploaded-files : {uploaded_files}')
651
- for uploaded_file in uploaded_files:
652
-
653
- processing_time = datetime.now().strftime('%d %b %Y, %I:%M %p')
654
- file_name = os.path.basename(uploaded_file.name)
655
- base_name, ext = os.path.splitext(file_name)
656
-
657
- full_path = os.path.realpath(uploaded_file.name)
658
- file_type = ext.lstrip('.')
659
-
660
- d = {
661
- 'icon': icon_to_types[file_type][0],
662
- 'document': base_name,
663
- 'type': icon_to_types[file_type][1],
664
- 'path': full_path,
665
- 'time': [datetime.strptime(processing_time, '%d %b %Y, %I:%M %p')],
666
- 'toggle': True
667
- }
668
-
669
- st.session_state.df = pd.concat(
670
- [st.session_state.df, pd.DataFrame(data=d)]
671
- )
672
- st.session_state.df.to_parquet(
673
- os.path.join(
674
- data_editor_path,
675
- 'data_editor.parquet.br'
676
- ),
677
- compression='brotli',
678
- engine='pyarrow'
679
- )
680
-
681
- elements = partition_pptx(file=uploaded_file)
682
-
683
- for elem in elements:
684
- elem.text = clean(elem.text, bullets=True)
685
- text_type = elem.to_dict()['type']
686
- print(f'UNSTRUCTURED TEXT: {text_type} , {elem.text}')
687
 
688
- weakDict, tables = ppt_chunk(uploaded_file, nlp)
689
- documents = weakDict.all_texts()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
690
 
691
- dense = dense_model.embed_documents(documents)
692
- print(f'dense: {type(dense)}')
693
- sparse = [SparseVector(indices=s.indices.tolist(), values=s.values.tolist()) for s in sparse_model.embed(documents, 32)]
694
- print(f'dense: {type(sparse)}')
695
 
696
- print(f'LEN: {len(documents)}, {len(weakDict.all_metadatas())}')
697
 
698
- client.upsert(
699
- collection_name=collection_name,
700
- points=Batch(
701
- ids=weakDict.all_ids(),
702
- payloads=[{ 'text': documents[i], 'metadata': metadata } for i, metadata in enumerate(weakDict.all_metadatas())],
703
- vectors={
704
- 'text-dense': dense,
705
- 'text-sparse': sparse
706
- }
 
707
  )
708
- )
709
 
710
- st.session_state.dialog_open = True
711
- dialog_container = float_dialog(
712
- show=st.session_state.dialog_open,
713
- width=7,
714
- background="#EEE3D3",
715
- transition=7,
716
- transition_from="top",
717
- transition_to="center"
718
- )
719
-
720
- with dialog_container:
721
- st.subheader("Documents Ingested !")
722
- if st.button("Ok", key="ok"):
723
- st.session_state.dialog_open = False
724
- st.rerun()
 
648
 
649
  uploaded_files = st.file_uploader("Upload a file :", accept_multiple_files=True, type=['pptx', 'ppt'])
650
  print(f'uploaded-files : {uploaded_files}')
651
+ if uploaded_files[-1] not in uploaded_files.deleted:
652
+ for uploaded_file in uploaded_files:
653
+
654
+ processing_time = datetime.now().strftime('%d %b %Y, %I:%M %p')
655
+ file_name = os.path.basename(uploaded_file.name)
656
+ base_name, ext = os.path.splitext(file_name)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
657
 
658
+ full_path = os.path.realpath(uploaded_file.name)
659
+ file_type = ext.lstrip('.')
660
+
661
+ d = {
662
+ 'icon': icon_to_types[file_type][0],
663
+ 'document': base_name,
664
+ 'type': icon_to_types[file_type][1],
665
+ 'path': full_path,
666
+ 'time': [datetime.strptime(processing_time, '%d %b %Y, %I:%M %p')],
667
+ 'toggle': True
668
+ }
669
+
670
+ st.session_state.df = pd.concat(
671
+ [st.session_state.df, pd.DataFrame(data=d)]
672
+ )
673
+ st.session_state.df.to_parquet(
674
+ os.path.join(
675
+ data_editor_path,
676
+ 'data_editor.parquet.br'
677
+ ),
678
+ compression='brotli',
679
+ engine='pyarrow'
680
+ )
681
+
682
+ weakDict, tables = ppt_chunk(uploaded_file, nlp)
683
+ documents = weakDict.all_texts()
684
 
685
+ dense = dense_model.embed_documents(documents)
686
+ print(f'dense: {type(dense)}')
687
+ sparse = [SparseVector(indices=s.indices.tolist(), values=s.values.tolist()) for s in sparse_model.embed(documents, 32)]
688
+ print(f'dense: {type(sparse)}')
689
 
690
+ print(f'LEN: {len(documents)}, {len(weakDict.all_metadatas())}')
691
 
692
+ client.upsert(
693
+ collection_name=collection_name,
694
+ points=Batch(
695
+ ids=weakDict.all_ids(),
696
+ payloads=[{ 'text': documents[i], 'metadata': metadata } for i, metadata in enumerate(weakDict.all_metadatas())],
697
+ vectors={
698
+ 'text-dense': dense,
699
+ 'text-sparse': sparse
700
+ }
701
+ )
702
  )
 
703
 
704
+ st.session_state.dialog_open = True
705
+ dialog_container = float_dialog(
706
+ show=st.session_state.dialog_open,
707
+ width=7,
708
+ background="#EEE3D3",
709
+ transition=7,
710
+ transition_from="top",
711
+ transition_to="center"
712
+ )
713
+
714
+ with dialog_container:
715
+ st.subheader("Documents Ingested !")
716
+ if st.button("Ok", key="ok"):
717
+ st.session_state.dialog_open = False
718
+ st.rerun()