import os
import gradio as gr
from zhconv import convert
from LLM import LLM
from ASR import WhisperASR
from TFG import SadTalker
from TTS import EdgeTTS
from src.cost_time import calculate_time
from configs import *

description = """

Linly 智能对话系统 (Linly-Talker)
[知乎] [bilibili] [GitHub] [个人主页]
Linly-Talker 是一款智能 AI 对话系统,结合了大型语言模型 (LLMs) 与视觉模型,是一种新颖的人工智能交互方式。

""" use_ref_video = False ref_video = None ref_info = 'pose' use_idle_mode = False length_of_audio = 5 @calculate_time def Asr(audio): try: question = asr.transcribe(audio) question = convert(question, 'zh-cn') except Exception as e: print("ASR Error: ", e) question = 'Gradio存在一些bug,麦克风模式有时候可能音频还未传入,请重新点击一下语音识别即可' gr.Warning(question) return question @calculate_time def LLM_response(question, voice = 'zh-CN-XiaoxiaoNeural', rate = 0, volume = 0, pitch = 0): #answer = llm.predict(question) answer = llm.generate(question) print(answer) try: tts.predict(answer, voice, rate, volume, pitch , 'answer.wav', 'answer.vtt') except: os.system(f'edge-tts --text "{answer}" --voice {voice} --write-media answer.wav') return 'answer.wav', 'answer.vtt', answer @calculate_time def Talker_response(text, voice, rate, volume, pitch, source_image, preprocess_type, is_still_mode, enhancer, batch_size, size_of_image, pose_style, facerender, exp_weight, blink_every, fps): voice = 'zh-CN-XiaoxiaoNeural' if voice not in tts.SUPPORTED_VOICE else voice driven_audio, driven_vtt, _ = LLM_response(text, voice, rate, volume, pitch) video = talker.test2(source_image, driven_audio, preprocess_type, is_still_mode, enhancer, batch_size, size_of_image, pose_style, facerender, exp_weight, use_ref_video, ref_video, ref_info, use_idle_mode, length_of_audio, blink_every, fps=fps) if driven_vtt: return video, driven_vtt else: return video def main(): with gr.Blocks(analytics_enabled=False, title = 'Linly-Talker') as inference: gr.HTML(description) with gr.Row(equal_height=False): with gr.Column(variant='panel'): with gr.Tabs(elem_id="sadtalker_source_image"): with gr.TabItem('Source image'): with gr.Row(): source_image = gr.Image(label="Source image", type="filepath", elem_id="img2img_image", width=512) with gr.Tabs(elem_id="question_audio"): with gr.TabItem('对话'): with gr.Column(variant='panel'): question_audio = gr.Audio(sources=['microphone','upload'], type="filepath", label = '语音对话') input_text = gr.Textbox(label="Input Text", lines=3, info = '文字对话') with gr.Accordion("Advanced Settings", open=False, visible=True) as parameter_article: voice = gr.Dropdown(tts.SUPPORTED_VOICE, value='zh-CN-XiaoxiaoNeural', label="Voice") rate = gr.Slider(minimum=-100, maximum=100, value=0, step=1.0, label='Rate') volume = gr.Slider(minimum=0, maximum=100, value=100, step=1, label='Volume') pitch = gr.Slider(minimum=-100, maximum=100, value=0, step=1, label='Pitch') asr_text = gr.Button('语音识别(语音对话后点击)') asr_text.click(fn=Asr,inputs=[question_audio],outputs=[input_text]) # with gr.Tabs(elem_id="response_audio"): # with gr.TabItem("语音选择"): # with gr.Column(variant='panel'): # voice = gr.Dropdown(VOICES, values='zh-CN-XiaoxiaoNeural') with gr.Tabs(elem_id="text_examples"): gr.Markdown("## Text Examples") examples = [ ['应对压力最有效的方法是什么?'], ['如何进行时间管理?'], ['为什么有些人选择使用纸质地图或寻求方向,而不是依赖GPS设备或智能手机应用程序?'], ['近日,苹果公司起诉高通公司,状告其未按照相关合约进行合作,高通方面尚未回应。这句话中“其”指的是谁?'], ['三年级同学种树80颗,四、五年级种的棵树比三年级种的2倍多14棵,三个年级共种树多少棵?'], ['撰写一篇交响乐音乐会评论,讨论乐团的表演和观众的整体体验。'], ['翻译成中文:Luck is a dividend of sweat. The more you sweat, the luckier you get.'], ] gr.Examples( examples = examples, inputs = [input_text], ) # driven_audio = 'answer.wav' with gr.Column(variant='panel'): with gr.Tabs(elem_id="sadtalker_checkbox"): with gr.TabItem('Settings'): with gr.Accordion("Advanced Settings", open=False): gr.Markdown("SadTalker: need help? 
please visit our [[best practice page](https://github.com/OpenTalker/SadTalker/blob/main/docs/best_practice.md)] for more detials") with gr.Column(variant='panel'): # width = gr.Slider(minimum=64, elem_id="img2img_width", maximum=2048, step=8, label="Manually Crop Width", value=512) # img2img_width # height = gr.Slider(minimum=64, elem_id="img2img_height", maximum=2048, step=8, label="Manually Crop Height", value=512) # img2img_width with gr.Row(): pose_style = gr.Slider(minimum=0, maximum=45, step=1, label="Pose style", value=0) # exp_weight = gr.Slider(minimum=0, maximum=3, step=0.1, label="expression scale", value=1) # blink_every = gr.Checkbox(label="use eye blink", value=True) with gr.Row(): size_of_image = gr.Radio([256, 512], value=256, label='face model resolution', info="use 256/512 model? 256 is faster") # preprocess_type = gr.Radio(['crop', 'resize','full', 'extcrop', 'extfull'], value='crop', label='preprocess', info="How to handle input image?") with gr.Row(): is_still_mode = gr.Checkbox(label="Still Mode (fewer head motion, works with preprocess `full`)") facerender = gr.Radio(['facevid2vid', 'PIRender'], value='facevid2vid', label='facerender', info="which face render?") with gr.Row(): batch_size = gr.Slider(label="batch size in generation", step=1, maximum=10, value=1) fps = gr.Slider(label='fps in generation', step=1, maximum=30, value =20) enhancer = gr.Checkbox(label="GFPGAN as Face enhancer(slow)") with gr.Tabs(elem_id="sadtalker_genearted"): gen_video = gr.Video(label="Generated video", format="mp4",scale=0.8) submit = gr.Button('Generate', elem_id="sadtalker_generate", variant='primary') submit.click( fn=Talker_response, inputs=[input_text, voice, rate, volume, pitch, source_image, preprocess_type, is_still_mode, enhancer, batch_size, size_of_image, pose_style, facerender, exp_weight, blink_every, fps], outputs=[gen_video] ) with gr.Row(): examples = [ [ 'examples/source_image/full_body_2.png', 'crop', False, False ], [ 'examples/source_image/full_body_1.png', 'crop', False, False ], [ 'examples/source_image/full3.png', 'crop', False, False ], [ 'examples/source_image/full4.jpeg', 'crop', False, False ], [ 'examples/source_image/art_13.png', 'crop', False, False ], [ 'examples/source_image/art_5.png', 'crop', False, False ], ] gr.Examples(examples=examples, fn=Talker_response, inputs=[ source_image, preprocess_type, is_still_mode, enhancer], outputs=[gen_video], # cache_examples=True, ) return inference if __name__ == "__main__": # llm = LLM(mode='offline').init_model('Linly', 'Linly-AI/Chinese-LLaMA-2-7B-hf') # llm = LLM(mode='offline').init_model('Gemini', 'gemini-pro', api_key = "your api key") # llm = LLM(mode='offline').init_model('Qwen', 'Qwen/Qwen-1_8B-Chat') llm = LLM(mode=mode).init_model('Qwen', 'Qwen/Qwen-1_8B-Chat') talker = SadTalker(lazy_load=True) asr = WhisperASR('base') tts = EdgeTTS() gr.close_all() demo = main() demo.queue() # demo.launch() demo.launch(server_name=ip, # 本地端口localhost:127.0.0.1 全局端口转发:"0.0.0.0" server_port=port, # 似乎在Gradio4.0以上版本可以不使用证书也可以进行麦克风对话 ssl_certfile=ssl_certfile, ssl_keyfile=ssl_keyfile, ssl_verify=False, debug=True)
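
# -----------------------------------------------------------------------------
# Note: `calculate_time` is imported from src.cost_time, which is not shown in
# this file. As a rough, illustrative sketch only (an assumption, not the
# project's actual implementation), a timing decorator with the same usage
# pattern could look like:
#
#     import time
#     from functools import wraps
#
#     def calculate_time(func):
#         @wraps(func)
#         def wrapper(*args, **kwargs):
#             start = time.time()
#             result = func(*args, **kwargs)
#             print(f"{func.__name__} took {time.time() - start:.2f} s")
#             return result
#         return wrapper
#
# Each decorated function (Asr, LLM_response, Talker_response) would then log
# its wall-clock runtime on every call.
# -----------------------------------------------------------------------------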