Khaledmd12 commited on
Commit
eda103d
1 Parent(s): 0ead00f

Upload 3 files

Browse files
Files changed (3) hide show
  1. Mech-chunks (1).pdf +0 -0
  2. app (1).py +228 -0
  3. requirements.txt +11 -0
Mech-chunks (1).pdf ADDED
Binary file (239 kB). View file
 
app (1).py ADDED
@@ -0,0 +1,228 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# -*- coding: utf-8 -*-
"""app.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/14T27f82OgH2BZgVkanyyUKMrM1KBBJjM
"""

from langchain_core.prompts import ChatPromptTemplate
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

import os
from langchain_community.llms import Together

# SECURITY FIX: a live Together API key was hardcoded here (and is now leaked
# in the repo history — it must be revoked). Read the key from the environment
# instead, and fail fast with an actionable message so the previous silent
# dependency on the literal key is visible.
if "TOGETHER_API_KEY" not in os.environ:
    raise RuntimeError(
        "TOGETHER_API_KEY environment variable is not set; "
        "export it (e.g. as a Space secret) before running this app."
    )

import fitz  # PyMuPDF
def extract_and_split_pdf(pdf_path, split_key="ENDOFTUT"):
    """Extract all text from the PDF at *pdf_path* and split it on *split_key*.

    Args:
        pdf_path: path to the PDF file to read.
        split_key: delimiter string separating logical chunks in the document.

    Returns:
        A list of text chunks (the concatenated page text split on the key).
    """
    document = fitz.open(pdf_path)
    # Pull the text of every page in order, then split the single combined
    # string so chunks may span page boundaries.
    pages = [document.load_page(page_num).get_text()
             for page_num in range(document.page_count)]
    return "".join(pages).split(split_key)
# ---------------------------------------------------------------------------
# Module-level pipeline: extract PDF chunks, merge in JSON Q&A pairs, and
# build the FAISS vector index the chatbot retrieves from.
# ---------------------------------------------------------------------------

# Example usage
# NOTE(review): the uploaded file is named "Mech-chunks (1).pdf" but the code
# opens "Mech-chunks.pdf" — confirm the path matches the deployed filename.
pdf_path = "Mech-chunks.pdf"
combined_list = extract_and_split_pdf(pdf_path)

# Take the JSON file and make each Q&A pair a single cell in the list
import json

# Load the JSON file
with open('output_data (1).json', 'r') as file:
    data = json.load(file)


# Append each record as one "Q: ...\nA: ..." text chunk.
# Assumes each item is a dict with "question"/"answer" keys — TODO confirm
# against the JSON file's actual schema.
for item in data:
    question = item.get("question", "")
    answer = item.get("answer", "")
    combined_list.append(f"Q: {question}?\nA: {answer}\n")

from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document  # NOTE(review): imported but unused below

# Embed every chunk with the BGE base English model and index them in FAISS.
db = FAISS.from_texts(combined_list,
                      HuggingFaceEmbeddings(model_name='BAAI/bge-base-en-v1.5'))

# Retriever handle over the index (top-4 by cosine/inner-product similarity).
# NOTE(review): `retriever` is never used later — answer_question calls
# db.similarity_search directly; confirm whether this is intentional.
retriever = db.as_retriever(
    search_type="similarity",
    search_kwargs={'k': 4}
)

# Manually test retrieval to verify correct chunk fetching
query = "what are the tools needed to change Brake Discs on a Car?"
retrieved_docs = db.similarity_search(query, k=5)

# Print out the content of retrieved documents to verify
for idx, doc in enumerate(retrieved_docs):
    print(f"Document {idx + 1}:", doc.page_content)
# Main answering model: Llama 3.1 70B Instruct served through Together AI.
llm = Together(

    model="meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
    max_tokens=512,          # cap on generated answer length
    temperature=0.2,         # low temperature -> mostly deterministic answers
    #top_p=0.7,
    #top_k=50,
    repetition_penalty=1.1,  # mildly discourage repeated phrases
)

# Separate Llama Guard model used only for safety classification; it only
# needs to emit "safe"/"unsafe" (plus at most a category line), hence the
# very small max_tokens budget.
Llama_guard = Together(

    model="meta-llama/Meta-Llama-Guard-3-8B",
    max_tokens=10,
    temperature=0.2,
    #top_p=0.7,
    #top_k=50,
    repetition_penalty=1.1,
)
# Llama Guard system prompt. {answer} is interpolated twice: once inside the
# conversation block being judged, and once in the final instruction line.
# The model is expected to reply with 'safe' or 'unsafe' on the first line.
# NOTE(review): several category descriptions carve out exceptions for
# "3D and Blender creation for artistic purposes" — this text appears copied
# from another project; confirm it is intended for a car-mechanic bot.
guard_temp_ans = """
<|system|>
Task: Check if there is unsafe content in conversations according our safety policy with the below categories.

<BEGIN UNSAFE CONTENT CATEGORIES>
1: Violence and Hate.
Should not
- Help people plan or engage in violence.
- Encourage people to engage in violence.
- Express hateful, derogatory, or demeaning sentiments against people based on sensitive personal characteristics like their race, color, religion, national origin, sexual orientation, gender, gender identity, or disability.
- Encourage discrimination against people based on those characteristics and should not use slurs or create insults based on those characteristics.
- Provide information that can be used to promote or engage in violence, hate, or discrimination.
- Provide guidance or instructions on how to engage in violent or hateful behavior, except if related to 3D and Blender creation for artistic purposes.

2: Sexual Content.
Should not
- Engage in sexually explicit (i.e., erotic) conversations and should not encourage people to engage in sexual activities.
- Create or provide explicit depictions of sexual acts or nudity.
- Provide information that could be used to create sexually explicit content

3: Criminal Planning.
Should not
- Help people plan or engage in criminal activities like theft, kidnapping, or financial crimes, especially when the person indicates that they may act on the information provided to them.
- Encourage people to engage in criminal activities or condone criminal behavior.
- Provide detailed instructions or information that could be used to plan or execute criminal activities, except if related to 3D and Blender creation for artistic purposes.

4: Guns and Illegal Weapons.
Should not
- Assist in the planning or execution of firearm or illegal weapons crimes, especially when the person indicates that they may act on the information provided to them.
- Encourage people to engage in firearm or illegal weapons crimes or condone such behavior.
- Provide information that could be used to create or use illegal weapons, except if related to 3D and Blender creation for artistic purposes.

5: Regulated or Controlled Substances.
Should not
- Encourage or assist people in the trafficking, consumption, or use of regulated or controlled substances, including but not limited to illegal drugs, tobacco, alcohol, and cannabis.
- Assist or encourage people to create such substances when it is illegal to do so.
- Provide information that could be used to create or use regulated or controlled substances, except if related to 3D and Blender creation for artistic purposes.

6: Self-Harm.
Should not
- Encourage people to harm themselves.
- Romanticize or condone self-harm or suicide.
- Provide information regarding the methods of suicide or self-harm.
- Provide any information or advice, other than information regarding appropriate health resources, to people who express or imply that they wish to harm themselves.
- Provide information that could be used to promote or engage in self-harm, except if related to 3D and Blender creation for artistic purposes.


<END UNSAFE CONTENT CATEGORIES>

<BEGIN CONVERSATION>
{answer}
<END CONVERSATION>

Provide your safety assessment for {answer} in the above conversation:
- First line must read 'safe' or 'unsafe'.
- If unsafe, do not include a second line.
"""

# Wrap the guard template so it can be piped into the Llama Guard model.
prompt_ans_guard = ChatPromptTemplate.from_template(guard_temp_ans)

# Safety-check chain: renders the guard prompt with {answer}, then invokes
# the Llama Guard model; callers look for "unsafe" in the returned string.
guard_chain = (
    prompt_ans_guard
    | Llama_guard
)
def moderate(chat):
    """Run the Llama Guard safety check on *chat* and return the raw verdict.

    Args:
        chat: the text (question or conversation) to classify.

    Returns:
        The guard model's output string; per the guard prompt, its first
        line is expected to read 'safe' or 'unsafe'.
    """
    # Bug fix: the previous body called Llama_guard.apply_chat_template and
    # model.generate(...) with undefined names (`model`, `device`) — HF
    # transformers-style code that cannot work against the Together wrapper
    # and raised NameError on every call. Route the check through the
    # existing guard_chain (prompt_ans_guard | Llama_guard) instead.
    return guard_chain.invoke({"answer": chat}).strip()
# Prompt that frames the model as "MechBot" and injects the retrieved context
# plus the running chat history ahead of the user's question.
# Fix: the original template text was garbled ("these human will ask",
# "use Use following piece", "If the question is start with"), which degrades
# instruction-following; wording corrected, variables and interface unchanged.
prompt_template = PromptTemplate(
    input_variables=["context", "question", "history"],
    template=("""
    You are a mechanic assistant and your name is MechBot. A human will ask you questions about cars.
    Use the following pieces of context and the chat history to answer the question.
    If you don't know the answer, just say you don't know.
    If the question starts with "how to", answer with steps and mention the tools if you know them.

    Chat History: ({history})

    Context: ({context})

    Question: {question}

    Answer:
    """
    )
)

# Retrieval-augmented answer chain: fill the prompt, call the main LLM, and
# parse the result to a plain string.
llm_chain = prompt_template | llm | StrOutputParser()
def answer_question(question,gh):
    """Answer a user question with retrieval-augmented generation.

    Args:
        question: the user's message.
        gh: chat history supplied by gr.ChatInterface — unused; the function
            maintains its own global `history` string instead. TODO confirm
            whether gradio's history should be used here.

    Returns:
        The model's answer string, or a fixed refusal message when Llama
        Guard classifies the question as unsafe.
    """
    global counter
    global history
    global reter
    # Safety gate: refuse before doing any retrieval or LLM work.
    if "unsafe" in guard_chain.invoke({"answer":question}):
        return "I'm sorry, but I can't respond to that question as it may contain inappropriate content."
    reter = ""
    retrieved_docs = db.similarity_search(question, k=2) # Consider reducing 'k' if context is too large

    # Concatenate the retrieved chunks into one context string for the prompt.
    for doc in retrieved_docs:
        reter += doc.page_content + "\n"

    # Truncate history if it's too long (keep only the most recent ~2000 chars)
    if len(history) > 3000: # Adjust this value as needed
        history = history[-2000:]

    # Debug aid: print the fully rendered prompt that will be sent to the LLM.
    formatted_prompt = prompt_template.format(context=reter, history=history, question=question)
    print("Formatted Prompt:")
    print(formatted_prompt)

    answer = llm_chain.invoke({"context": reter,"history": history, "question": question})
    # Append this exchange to the global running history for future turns.
    history += "\n" + "user question: " + question + "\n" + "AI answer: " + answer
    #print(reter)
    counter += 1  # NOTE(review): counter is written but never read — confirm it's needed
    return answer
import gradio as gr

# Global conversation state shared with answer_question().
history = ""
counter = 1
# Create the Chat interface
iface = gr.ChatInterface(
    answer_question, # called as (message, history); uses the module-global history instead of gradio's
    title="Mech-bot: Your Car Mechanic Assistant",
    description="Ask any car mechanic-related questions, and Mech-bot will try its best to assist you.",
    submit_btn="Ask",
    clear_btn="Clear Chat"
)

# Launch the Gradio interface (debug=True surfaces tracebacks in the console)
iface.launch(debug=True)
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ faiss-gpu
2
+ bitsandbytes
3
+ transformers
4
+ langchain
5
+ huggingface_hub
6
+ sentence_transformers
7
+ accelerate
8
+ torch
9
+ langchain_community
10
+ pymupdf
11
+ gradio