import streamlit as st
import torch
from datasets import load_dataset
from transformers import pipeline


@st.cache_resource
def load_models():
    """Load the pipelines and the speaker embedding once.

    Streamlit reruns the whole script on every interaction, so caching the
    models avoids reloading them on each button press.
    """
    # Load the visual question answering model microsoft/git-base-vqav2.
    # GIT is a generative VQA model, so it belongs to the
    # "visual-question-answering" pipeline, not "text2text-generation".
    vqa = pipeline("visual-question-answering", model="microsoft/git-base-vqav2")

    # Load the text-to-speech model microsoft/speecht5_tts
    tts = pipeline("text-to-speech", model="microsoft/speecht5_tts")

    # SpeechT5 conditions its voice on a speaker x-vector; take one from the
    # CMU ARCTIC speaker-embeddings dataset
    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
    speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

    return vqa, tts, speaker_embeddings


def main():
    st.title("Visual Question Answering with Text-to-Speech")

    vqa_pipeline, text_to_speech_pipeline, speaker_embeddings = load_models()

    image_path = st.text_input("Enter image path:")
    question = st.text_input("Enter your question:")

    if st.button("Get Answer"):
        # The VQA pipeline takes the image (a local path or URL) and the
        # question; generative models return the text under the "answer" key
        answer = vqa_pipeline(image=image_path, question=question)[0]["answer"]

        # The speaker embedding is a model input, not part of the text, so it
        # is passed to the TTS pipeline via forward_params rather than being
        # appended to the string
        speech = text_to_speech_pipeline(
            answer, forward_params={"speaker_embeddings": speaker_embeddings}
        )

        st.write("Answer:", answer)
        # The TTS pipeline returns a dict holding the waveform and its
        # sampling rate
        st.audio(speech["audio"], sample_rate=speech["sampling_rate"])


if __name__ == "__main__":
    main()
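
# ---------------------------------------------------------------------------
# Usage sketch: a minimal way to try this out, assuming the script is saved
# as app.py (the filename is an assumption, not given in the original).
# Launch the UI with:
#
#   streamlit run app.py
#
# To sanity-check the VQA pipeline outside Streamlit, the snippet below can
# be run on its own; the image URL is a hypothetical placeholder:
#
#   from transformers import pipeline
#   vqa = pipeline("visual-question-answering", model="microsoft/git-base-vqav2")
#   result = vqa(image="https://example.com/dog.jpg", question="What is in the picture?")
#   print(result[0]["answer"])
# ---------------------------------------------------------------------------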