SahilJ2 committed on
Commit
453aed9
·
verified ·
1 Parent(s): 9c6aa3b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +17 -13
app.py CHANGED
@@ -238,26 +238,30 @@ def predict_category(que, input_image):
238
  return preds[0]
239
 
240
 
241
- def combine(audio, input_image):
242
- que = transcribe_audio(audio)
243
- # que = "What is the animal here?"
 
 
244
 
245
  image = Image.fromarray(input_image).convert('RGB')
246
  category = predict_category(que, image)
247
-
248
  answer = predict_answer(0, que, image)
249
 
250
- # print(category)
251
-
252
  tts = gTTS(answer)
253
  tts.save('answer.mp3')
254
- return que, answer, 'answer.mp3'
255
-
256
-
257
-
258
- # Define the Gradio interface for recording audio and displaying the transcription
259
- model_interface = gr.Interface(fn=combine, inputs=[gr.Microphone(label="Ask your question"),gr.Image(label="Upload the image")], outputs=[gr.Text(label="Transcribed Question"), gr.Text(label="Answer"), gr.Audio(label="Audio Answer")])
260
- # image_upload_interface = gr.Interface(fn=upload_image, inputs=gr.Image(label="Upload the image"), outputs="text")
 
 
 
 
 
261
 
262
  # Launch the Gradio interface
263
  model_interface.launch(debug=True)
 
238
  return preds[0]
239
 
240
 
241
+ def combine(audio, input_image, text_question=""):
242
+ if audio:
243
+ que = transcribe_audio(audio)
244
+ else:
245
+ que = text_question
246
 
247
  image = Image.fromarray(input_image).convert('RGB')
248
  category = predict_category(que, image)
 
249
  answer = predict_answer(0, que, image)
250
 
 
 
251
  tts = gTTS(answer)
252
  tts.save('answer.mp3')
253
+
254
+ return que, answer, 'answer.mp3', category
255
+
256
+ # Define the Gradio interface for recording audio, text input, and image upload
257
+ model_interface = gr.Interface(fn=combine,
258
+ inputs=[gr.Microphone(label="Ask your question"),
259
+ gr.Image(label="Upload the image"),
260
+ gr.Textbox(label="Text Question")],
261
+ outputs=[gr.Text(label="Transcribed Question"),
262
+ gr.Text(label="Answer"),
263
+ gr.Audio(label="Audio Answer"),
264
+ gr.Text(label="Category")])
265
 
266
  # Launch the Gradio interface
267
  model_interface.launch(debug=True)