vivekvar committed on
Commit
3cb2c45
·
verified ·
1 Parent(s): 6bcb648

Upload 8 files

Browse files
app.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from llama_index.core import StorageContext, load_index_from_storage, VectorStoreIndex, SimpleDirectoryReader, ChatPromptTemplate
3
+ from llama_index.llms.huggingface import HuggingFaceInferenceAPI
4
+ from dotenv import load_dotenv
5
+ from llama_index.embeddings.huggingface import HuggingFaceEmbedding
6
+ from llama_index.core import Settings
7
+ import os
8
+ import base64
9
+ import altair as alt
10
+
11
# Read variables such as HF_TOKEN from a .env file into the process environment.
load_dotenv()

# Global LlamaIndex defaults: the same Gemma checkpoint is used for both the
# model and its tokenizer; answers are kept short and near-deterministic.
_GEMMA_MODEL = "google/gemma-1.1-7b-it"
Settings.llm = HuggingFaceInferenceAPI(
    model_name=_GEMMA_MODEL,
    tokenizer_name=_GEMMA_MODEL,
    context_window=3000,
    token=os.getenv("HF_TOKEN"),
    max_new_tokens=512,
    generate_kwargs={"temperature": 0.1},
)
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

# PERSIST_DIR holds the serialized index; DATA_DIR holds the uploaded PDF.
PERSIST_DIR = "./db"
DATA_DIR = "data"

# Create both folders up front so later reads and writes do not fail on a
# missing directory.
for _directory in (DATA_DIR, PERSIST_DIR):
    os.makedirs(_directory, exist_ok=True)
34
+
35
def displayPDF(file):
    """Embed the PDF stored at path ``file`` in the Streamlit page.

    The file's bytes are base64-encoded and placed in a ``data:`` URI that is
    used as the source of an ``<iframe>`` written with ``st.markdown``.
    """
    with open(file, "rb") as pdf_handle:
        raw_bytes = pdf_handle.read()
    encoded = base64.b64encode(raw_bytes).decode('utf-8')
    iframe_html = (
        f'<iframe src="data:application/pdf;base64,{encoded}" '
        'width="100%" height="600" type="application/pdf"></iframe>'
    )
    # Raw HTML must be explicitly allowed for the iframe markup to be rendered.
    st.markdown(iframe_html, unsafe_allow_html=True)
40
+
41
def data_ingestion():
    """Index every document in DATA_DIR and persist the index to PERSIST_DIR.

    Documents are loaded with ``SimpleDirectoryReader``, turned into a
    ``VectorStoreIndex`` (using the globally configured embedding model), and
    the index's own storage context is written to disk so that
    ``handle_query`` can reload it later.
    """
    documents = SimpleDirectoryReader(DATA_DIR).load_data()
    # from_documents builds its own storage context, so no separate
    # StorageContext needs to be created here.
    index = VectorStoreIndex.from_documents(documents)
    index.storage_context.persist(persist_dir=PERSIST_DIR)
46
+
47
def handle_query(query):
    """Answer ``query`` using the index persisted in PERSIST_DIR.

    Args:
        query: The user's question as plain text.

    Returns:
        The response text produced by the query engine, a notice asking the
        user to upload a PDF when no index has been persisted yet, or a
        generic apology when the engine's result carries no response.
    """
    try:
        storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
    except FileNotFoundError:
        # PERSIST_DIR is created empty at start-up, so before the first
        # ingestion there are no index files to load.
        # NOTE(review): assumes LlamaIndex raises FileNotFoundError for a
        # missing docstore.json - confirm against the installed version.
        return (
            "No document has been processed yet. "
            "Please upload a PDF and click Submit & Process first."
        )
    index = load_index_from_storage(storage_context)

    # Single user-role message; {context_str} and {query_str} are the
    # placeholders filled in by the query engine.
    qa_prompt = (
        "created by vivek created for Neonflake Enterprises OPC Pvt Ltd\n"
        "Context:\n"
        "{context_str}\n"
        "Question:\n"
        "{query_str}\n"
    )
    text_qa_template = ChatPromptTemplate.from_messages([("user", qa_prompt)])

    query_engine = index.as_query_engine(text_qa_template=text_qa_template)
    answer = query_engine.query(query)

    # Accept either an object exposing ``.response`` or a mapping with a
    # 'response' key; anything else falls through to the apology.
    if hasattr(answer, 'response'):
        return answer.response
    if isinstance(answer, dict) and 'response' in answer:
        return answer['response']
    return "Sorry, I couldn't find an answer."
72
+
73
+
74
# Streamlit app initialization
st.title("Chat with your PDF📄")
st.markdown("Built by [vivek](https://github.com/saravivek-cyber)")
st.markdown("chat here")

# Seed the chat history once per session; Streamlit re-runs this script on
# every interaction, so the history has to live in session_state.
if 'messages' not in st.session_state:
    st.session_state.messages = [{'role': 'assistant', "content": 'Hello! Upload a PDF and ask me anything about its content.'}]

with st.sidebar:
    st.title("Menu:")
    # Only PDFs are accepted because the upload is always saved and parsed as
    # a .pdf file.
    uploaded_file = st.file_uploader(
        "Upload your PDF Files and Click on the Submit & Process Button",
        type=["pdf"],
    )
    if st.button("Submit & Process"):
        if uploaded_file is None:
            # Without this guard, clicking the button before choosing a file
            # fails on uploaded_file.getbuffer().
            st.warning("Please upload a PDF file first.")
        else:
            with st.spinner("Processing..."):
                # Always overwrite the same file so only the latest upload is indexed.
                filepath = os.path.join(DATA_DIR, "saved_pdf.pdf")
                with open(filepath, "wb") as f:
                    f.write(uploaded_file.getbuffer())
                # Optional: call displayPDF(filepath) here to preview the uploaded PDF.
                data_ingestion()  # Rebuild the index every time a new file is uploaded
                st.success("Done")

user_prompt = st.chat_input("Ask me anything about the content of the PDF:")
if user_prompt:
    # Record the question and its answer before rendering, so both appear in
    # the transcript drawn below on this same run.
    st.session_state.messages.append({'role': 'user', "content": user_prompt})
    response = handle_query(user_prompt)
    st.session_state.messages.append({'role': 'assistant', "content": response})

# Render the full conversation so far.
for message in st.session_state.messages:
    with st.chat_message(message['role']):
        st.write(message['content'])
data/saved_pdf.pdf ADDED
Binary file (64.9 kB). View file
 
db/default__vector_store.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"embedding_dict": {"860863cb-ba90-4287-9ff3-de812a7cf04a": [-0.04935232177376747, 0.014276032336056232, -0.00677516171708703, -0.007499701343476772, 0.0857427641749382, 0.03596746549010277, -0.06747723370790482, 0.035292088985443115, 0.0009802288841456175, 0.011373339220881462, 0.024754010140895844, -0.03062259964644909, 0.011178512126207352, 0.01658494584262371, -0.025193003937602043, 0.007735987193882465, -0.026199763640761375, -0.004099538084119558, 0.03002157434821129, 0.018483860418200493, 0.008464190177619457, -0.03511177375912666, 0.021300099790096283, 0.045198313891887665, 0.02103796787559986, 0.04658903181552887, -0.03597500920295715, -0.07936180382966995, -0.048395734280347824, -0.23985832929611206, 0.05988171696662903, 0.00787472166121006, 0.06542612612247467, 0.016137035563588142, -0.029961371794342995, 0.016481216996908188, -0.020172743126749992, 0.05845439434051514, 0.029444292187690735, 0.0327472984790802, -0.060793109238147736, -0.021447496488690376, -0.010194200091063976, -0.03895203024148941, 0.002724932273849845, -0.04428384453058243, -0.04992429539561272, -0.020091135054826736, -0.008397594094276428, 0.006678513251245022, -0.04897148534655571, -0.060120902955532074, 0.00495715020224452, 0.031236156821250916, -0.024438021704554558, -0.001887921360321343, 0.08040959388017654, 0.0212861318141222, 0.019926222041249275, 0.041057415306568146, 0.0645257979631424, 0.029546035453677177, -0.13904771208763123, 0.01744740828871727, 0.06266944855451584, 0.041324276477098465, -0.0977046936750412, -0.042026087641716, -0.02397708222270012, 0.000938264187425375, -0.031055675819516182, 0.022282985970377922, 0.039871398359537125, 0.011208692565560341, 0.059094201773405075, 0.01761738955974579, 0.03747517243027687, 0.002703616861253977, 0.015252131968736649, -0.0339764729142189, 0.034126173704862595, 0.05502382293343544, 0.020618971437215805, -0.030696270987391472, -0.013595390133559704, -0.02856854349374771, 0.0403427854180336, -0.02784808911383152, 
-0.004025080241262913, -0.011623065918684006, -0.023647073656320572, -0.021383848041296005, -0.03101491369307041, 0.05079283565282822, -0.04112066701054573, -0.03585713356733322, -0.02295582741498947, 0.0062613822519779205, 0.02795836143195629, 0.45919111371040344, 0.015642572194337845, 0.010819172486662865, -0.018927421420812607, 0.05333065986633301, -0.013381321914494038, -0.06688794493675232, 0.004536801483482122, -0.02615642547607422, -0.024168578907847404, -0.004391009919345379, 0.02736389823257923, 0.02036883682012558, -0.040253669023513794, -0.03790125250816345, 0.007962658070027828, -0.0071958815678954124, 0.049702417105436325, -0.01769251562654972, -0.007585907820612192, -0.012356655672192574, 0.008346965536475182, -0.025942832231521606, 0.030699746683239937, -0.031913790851831436, 0.05911710113286972, -0.05253056809306145, 0.03923376277089119, 0.08774489164352417, 0.038656190037727356, -0.001138997613452375, 0.027105676010251045, 0.027531057596206665, -0.08285865932703018, 0.01835499331355095, -0.00720310490578413, -0.030862700194120407, -0.0021355219651013613, -0.016528071835637093, 0.007597050163894892, 0.02883450873196125, -0.011319754645228386, 0.03828631341457367, -0.007711352314800024, -0.09212921559810638, -0.12980586290359497, 0.153176948428154, 0.011540068313479424, 0.03229355439543724, -0.036570385098457336, -0.05308893322944641, 0.04680844396352768, 0.07722964882850647, -0.019974086433649063, 0.01124553382396698, 0.03856383636593819, 0.031176432967185974, -0.019309746101498604, -0.023881902918219566, -0.020723534747958183, 0.03380262106657028, -0.018595557659864426, -0.030544748529791832, 0.021867908537387848, 0.10692565888166428, 0.01082476507872343, -0.032912179827690125, -0.007422571070492268, 0.044559549540281296, -0.02630840614438057, 0.006297847256064415, -0.005424784496426582, -0.043432921171188354, 0.018201762810349464, 0.016728423535823822, -0.03545304760336876, 0.036799076944589615, -0.00962867308408022, -0.01837957464158535, 
-0.005215011071413755, 0.006207386497408152, -0.003111738944426179, -0.014922741800546646, -0.07831963896751404, 0.011641087010502815, 0.03601595386862755, 0.01461033709347248, -0.018093876540660858, -0.01139074470847845, 0.004400675185024738, 0.06439895927906036, 0.043229930102825165, -0.05553846061229706, -0.007091710343956947, 0.03263641893863678, -0.01781558059155941, 0.010551278479397297, -0.010003015398979187, -0.005920502822846174, -0.0057300967164337635, -0.06755722314119339, 0.03704983368515968, 0.0756269097328186, 0.03213987126946449, 0.0392606295645237, -0.02188468724489212, 0.005344125907868147, -0.007023293059319258, 0.006461248733103275, 0.05830821394920349, 0.02524745464324951, -0.06875360757112503, 0.0077574849128723145, -0.008623662404716015, 0.017420368269085884, -0.014371651224792004, 0.01098068617284298, 0.015060738660395145, 0.0575602725148201, -0.015285676345229149, 0.055246952921152115, -0.0046562557108700275, -0.0018068264471367002, -0.017236808314919472, -0.3138227164745331, -0.03679979592561722, 0.025411024689674377, 0.04189690202474594, -0.01532608363777399, -0.0593300499022007, 0.007799589075148106, 0.017681816592812538, 0.004527967423200607, 0.014501117169857025, 0.03831024095416069, 0.0643090233206749, -0.06463051587343216, -0.06307888776063919, -0.018941743299365044, -0.014238502830266953, 0.007583525497466326, -0.017396165058016777, -0.032528795301914215, 0.04459691792726517, 0.016582539305090904, 0.002379573881626129, -0.0007301989244297147, -0.01501720491796732, 0.07269556820392609, -0.03588758409023285, 0.1032787337899208, -0.06350622326135635, 0.006171433720737696, 0.015278211794793606, -0.02456526644527912, 0.03500528261065483, -0.03471928462386131, -0.03898708149790764, 0.023004775866866112, -0.028884489089250565, -0.057739198207855225, -0.0035506936255842447, -0.019762180745601654, -0.039818767458200455, -0.020245185121893883, -0.001681070076301694, 0.03693488612771034, -0.08100935816764832, -0.03178160637617111, 
-0.030177531763911247, -0.020812533795833588, -0.014491626992821693, -0.01819487474858761, -0.00693318247795105, -0.025987470522522926, 0.0203871913254261, 0.03322084620594978, 0.007113815750926733, 0.01175146084278822, 0.011802028864622116, -0.04891163483262062, 0.03362104669213295, -0.07420346140861511, -0.07266935706138611, 0.00424754386767745, -0.05110837519168854, 0.03763461858034134, -0.0029107211157679558, -0.0017669596709311008, -0.009217919781804085, -0.0223727747797966, -0.02276541478931904, -0.005709769204258919, -0.018832406029105186, -0.018558891490101814, 0.09701454639434814, -3.134470171062276e-05, 0.013778149150311947, 0.025061175227165222, 0.06504049152135849, -0.0010301598813384771, -0.0897194966673851, -0.02937312051653862, -0.010941596701741219, 0.0018644781084731221, 0.05590954050421715, 0.026075152680277824, 0.05139530077576637, -0.010917403735220432, 0.00870157778263092, 0.015995193272829056, 0.006445177365094423, 0.04837393760681152, 0.025667952373623848, -0.008727463893592358, -0.029190072789788246, -0.018183276057243347, -0.0621129646897316, 0.020885098725557327, 0.06874728947877884, -0.23344504833221436, -0.02258511632680893, -0.01419254019856453, 0.10320448130369186, -0.005822952836751938, 0.0024899616837501526, 0.023092474788427353, -0.0038407070096582174, -0.013031963258981705, 0.03569323942065239, -0.049997638911008835, 0.029806140810251236, 0.009199211373925209, -0.058769576251506805, -0.03448537364602089, 0.04246056452393532, 0.07039237767457962, -0.04602818936109543, 0.06294197589159012, 0.004651046358048916, -0.001652638427913189, -0.03949356824159622, 0.12305214256048203, -0.02445746771991253, -0.04061185196042061, -0.0042878249660134315, 0.011829865165054798, -0.019710924476385117, 0.028901495039463043, 0.033081069588661194, -0.010725338943302631, -0.02745557762682438, 0.10162729769945145, 0.05656793341040611, 0.016987605020403862, 0.025119241327047348, 0.004949004389345646, -0.061499010771512985, 0.0024393266066908836, 
0.01115493569523096, 0.026186874136328697, -0.03403957560658455, 0.015666430816054344, 0.018578195944428444, 0.054081711918115616, 0.009812631644308567, -0.008632104843854904, -0.08555403351783752, 0.00032727871439419687, 0.013853251934051514, -0.04474908858537674, -0.022167515009641647, -0.008224789053201675, 0.01202460192143917, 0.05611058324575424, 0.009806507267057896, 0.03089478611946106, 0.008607840165495872, -0.023285187780857086, -0.045362312346696854, 0.031830571591854095, -0.02046814002096653, 0.01958327554166317, -0.022897496819496155, -0.02404063194990158]}, "text_id_to_ref_doc_id": {"860863cb-ba90-4287-9ff3-de812a7cf04a": "ac226b84-1585-4759-add3-dc5d0af6ef65"}, "metadata_dict": {"860863cb-ba90-4287-9ff3-de812a7cf04a": {"page_label": "1", "file_name": "saved_pdf.pdf", "file_path": "E:\\llama-index RAG\\data\\saved_pdf.pdf", "file_type": "application/pdf", "file_size": 64903, "creation_date": "2024-04-14", "last_modified_date": "2024-04-17", "_node_type": "TextNode", "document_id": "ac226b84-1585-4759-add3-dc5d0af6ef65", "doc_id": "ac226b84-1585-4759-add3-dc5d0af6ef65", "ref_doc_id": "ac226b84-1585-4759-add3-dc5d0af6ef65"}}}
db/docstore.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"docstore/metadata": {"ac226b84-1585-4759-add3-dc5d0af6ef65": {"doc_hash": "2c629fa1f2e1e85f17b7d012739aea7cba30cdd55f935ed2225710942132eabf"}, "860863cb-ba90-4287-9ff3-de812a7cf04a": {"doc_hash": "2c629fa1f2e1e85f17b7d012739aea7cba30cdd55f935ed2225710942132eabf", "ref_doc_id": "ac226b84-1585-4759-add3-dc5d0af6ef65"}}, "docstore/data": {"860863cb-ba90-4287-9ff3-de812a7cf04a": {"__data__": {"id_": "860863cb-ba90-4287-9ff3-de812a7cf04a", "embedding": null, "metadata": {"page_label": "1", "file_name": "saved_pdf.pdf", "file_path": "E:\\llama-index RAG\\data\\saved_pdf.pdf", "file_type": "application/pdf", "file_size": 64903, "creation_date": "2024-04-14", "last_modified_date": "2024-04-17"}, "excluded_embed_metadata_keys": ["file_name", "file_type", "file_size", "creation_date", "last_modified_date", "last_accessed_date"], "excluded_llm_metadata_keys": ["file_name", "file_type", "file_size", "creation_date", "last_modified_date", "last_accessed_date"], "relationships": {"1": {"node_id": "ac226b84-1585-4759-add3-dc5d0af6ef65", "node_type": "4", "metadata": {"page_label": "1", "file_name": "saved_pdf.pdf", "file_path": "E:\\llama-index RAG\\data\\saved_pdf.pdf", "file_type": "application/pdf", "file_size": 64903, "creation_date": "2024-04-14", "last_modified_date": "2024-04-17"}, "hash": "2c629fa1f2e1e85f17b7d012739aea7cba30cdd55f935ed2225710942132eabf", "class_name": "RelatedNodeInfo"}}, "text": "3.DataProblem\nThisdocumentoutlinesthespecificinstructionsforpreparingtheprovideddatabaseofhumanvoice\nrecordingsfortrainingamachinelearningmodelcapableofdistinguishingbetweenauthenticand\nsyntheticvoices.\n1.DataExplorationandAnalysis:\n\uf0fc UtilizetoolssuchasMatplotlibandSeabornforin-depthdataanalysisandvisualization.\n\uf0fc Beginwithacomprehensiveexplorationofthedatabase,understandingcharacteristics,and\nassessingthedistributionofauthenticandsyntheticsamples.\n\uf0fc Identifyandaddressimbalancedsamplesinthedataset.\n2.ImbalanceHandling:\n\uf0fc 
Enhancemodelperformancebyemployingtechniquessuchasoversamplingorundersampling,\ne.g.,usingSMOTEorImblearn.\n3.DataCleaning:\n\uf0fc Addressvariationsinsamplewavlengthbyfindingthemeanoftotalsamplelengths.\n\uf0fc Utilizepaddingtechniquestostandardizeeachsampletothefixedmeanlength.\n\uf0fc Handlemisclassifiedsampleswithinthedataset.\n4.FeatureEngineering:\n\uf0fc ExtractrelevantacousticfeatureslikeMFCCs,spectrograms,andpitchfromaudiorecordings.\n\uf0fc Experimentwithdifferentfeaturesetstoidentifythemostdiscriminativeones.\n\uf0fc Normalizeandstandardizefeaturesforconsistentscaling,facilitatingmodeltraining.\n5.SpeakerEmbeddings:\n\uf0fc Considerincorporatingspeakerembeddingstocaptureindividualcharacteristics,enhancingthe\nmodel'sabilitytogeneralizeacrossdiversevoices.\n\uf0fc Implementsuitablemethodsforextractingspeakerembeddings,suchaspre-trainedmodelsor\ntrainingonthedataset.\n6.DataSplitting:\n\uf0fc Splitthedataintotraining,validation,andtestsets,ensuringastratifiedsplit.\n\uf0fc Evaluatemodelperformanceonthevalidationset,minimizinglossbeforefinaltestingonthe\ntestsamples.\n7.DataAugmentation:\n\uf0fc Applydataaugmentationtechniquestoincreasemodelrobustnessagainstvariationsin\nrecordingconditions.\n\uf0fc Techniquesmayincluderandompitchshifts,time-stretching,orintroducingbackgroundnoise.\n8.QualityControl:\n\uf0fc Conductarigorousqualitycontrolchecktoidentifyandaddressanomaliesoroutliersinthe\ndataset.\n\uf0fc Verifythatdatapreprocessingstepsdonotintroduceartifactsnegativelyaffectingmodel\nperformance.\nOncethedataispreparedfollowingtheseguidelines,thetransitionintothemodeldevelopment\nphasewillfocusonselectinganappropriatearchitecture,trainingthemodel,andfine-tuningitfor\noptimalperformance.", "start_char_idx": 0, "end_char_idx": 2150, "text_template": "{metadata_str}\n\n{content}", "metadata_template": "{key}: {value}", "metadata_seperator": "\n", "class_name": "TextNode"}, "__type__": "1"}}, "docstore/ref_doc_info": {"ac226b84-1585-4759-add3-dc5d0af6ef65": 
{"node_ids": ["860863cb-ba90-4287-9ff3-de812a7cf04a"], "metadata": {"page_label": "1", "file_name": "saved_pdf.pdf", "file_path": "E:\\llama-index RAG\\data\\saved_pdf.pdf", "file_type": "application/pdf", "file_size": 64903, "creation_date": "2024-04-14", "last_modified_date": "2024-04-17"}}}}
db/graph_store.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"graph_dict": {}}
db/image__vector_store.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"embedding_dict": {}, "text_id_to_ref_doc_id": {}, "metadata_dict": {}}
db/index_store.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"index_store/data": {"01282bcc-a355-4256-b245-78ace39871e9": {"__type__": "vector_store", "__data__": "{\"index_id\": \"01282bcc-a355-4256-b245-78ace39871e9\", \"summary\": null, \"nodes_dict\": {\"860863cb-ba90-4287-9ff3-de812a7cf04a\": \"860863cb-ba90-4287-9ff3-de812a7cf04a\"}, \"doc_id_dict\": {}, \"embeddings_dict\": {}}"}}}
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ streamlit
2
+ python-dotenv
3
+ llama-index
4
+ llama-index-embeddings-huggingface
5
+ llama-index-llms-huggingface