"""Gradio demo for DiffRhythm: song generation plus LLM-assisted LRC writing."""

import argparse
import json
import os
import random
import sys

import devicetorch
import gradio as gr
import numpy as np
import requests
import torch
import torchaudio
from einops import rearrange
from openai import OpenAI, OpenAIError
from tqdm import tqdm
# from volcenginesdkarkruntime import Ark
# import spaces

from diffrhythm.infer.infer import inference
from diffrhythm.infer.infer_utils import (
    get_reference_latent,
    get_lrc_token,
    get_style_prompt,
    prepare_model,
    get_negative_style_prompt,
)

# Models are loaded once at import time and shared by every request.
device = devicetorch.get(torch)
cfm, tokenizer, muq, vae = prepare_model(device)
# cfm = torch.compile(cfm)

# Settings shared by both LLM helpers.
ARK_BASE_URL = "https://ark.cn-beijing.volces.com/api/v3"
ARK_MODEL = "ep-20250304144033-nr9wl"
LLM_SYSTEM_PROMPT = (
    "You are a professional musician who has been invited to make "
    "music-related comments."
)

# Prompt used to write a complete timestamped lyric from a theme and tags.
THEME_PROMPT = """
请围绕"{theme}"主题生成一首符合"{tags}"风格的语言为{language}的完整歌词。严格遵循以下要求:

### **强制格式规则**
1. **仅输出时间戳和歌词**,禁止任何括号、旁白、段落标记(如副歌、间奏、尾奏等注释)。
2. 每行格式必须为 `[mm:ss.xx]歌词内容`,时间戳与歌词间无空格,歌词内容需完整连贯。
3. 时间戳需自然分布,**第一句歌词起始时间不得为 [00:00.00]**,需考虑前奏空白。

### **内容与结构要求**
1. 歌词应富有变化,使情绪递进,整体连贯有层次感。**每行歌词长度应自然变化**,切勿长度一致,导致很格式化。
2. **时间戳分配应根据歌曲的标签、歌词的情感、节奏来合理推测**,而非机械地按照歌词长度分配。
3. 间奏/尾奏仅通过时间空白体现(如从 [02:30.00] 直接跳至 [02:50.00]),**无需文字描述**。

### **负面示例(禁止出现)**
- 错误:[01:30.00](钢琴间奏)
- 错误:[02:00.00][副歌]
- 错误:空行、换行符、注释
"""

# Prompt used to add timestamps to lyrics the user already has.
TIMESTAMP_PROMPT = """
{lyrics_input}这是一首歌的歌词,每一行是一句歌词,{tags_lyrics}是我希望这首歌的风格,我现在想要给这首歌的每一句歌词打时间戳得到LRC,我希望时间戳分配应根据歌曲的标签、歌词的情感、节奏来合理推测,而非机械地按照歌词长度分配。第一句歌词的时间戳应考虑前奏长度,避免歌词从 `[00:00.00]` 直接开始。严格按照 LRC 格式输出歌词,每行格式为 `[mm:ss.xx]歌词内容`。最后的结果只输出LRC,不需要其他的解释。
"""

# Pre-filled lyrics for the Lrc textbox; also reused as the first Lrc example.
DEFAULT_LRC = """\
[00:10.00]Moonlight spills through broken blinds
[00:13.20]Your shadow dances on the dashboard shrine
[00:16.85]Neon ghosts in gasoline rain
[00:20.40]I hear your laughter down the midnight train
[00:24.15]Static whispers through frayed wires
[00:27.65]Guitar strings hum our cathedral choirs
[00:31.30]Flicker screens show reruns of June
[00:34.90]I'm drowning in this mercury lagoon
[00:38.55]Electric veins pulse through concrete skies
[00:42.10]Your name echoes in the hollow where my heartbeat lies
[00:45.75]We're satellites trapped in parallel light
[00:49.25]Burning through the atmosphere of endless night
[01:00.00]Dusty vinyl spins reverse
[01:03.45]Our polaroid timeline bleeds through the verse
[01:07.10]Telescope aimed at dead stars
[01:10.65]Still tracing constellations through prison bars
[01:14.30]Electric veins pulse through concrete skies
[01:17.85]Your name echoes in the hollow where my heartbeat lies
[01:21.50]We're satellites trapped in parallel light
[01:25.05]Burning through the atmosphere of endless night
[02:10.00]Clockwork gears grind moonbeams to rust
[02:13.50]Our fingerprint smudged by interstellar dust
[02:17.15]Velvet thunder rolls through my veins
[02:20.70]Chasing phantom trains through solar plane
[02:24.35]Electric veins pulse through concrete skies
[02:27.90]Your name echoes in the hollow where my heartbeat lies"""

# Second entry of the Lrc examples.
EXAMPLE_LRC_2 = """\
[00:04.34]Tell me that I'm special
[00:06.57]Tell me I look pretty
[00:08.46]Tell me I'm a little angel
[00:10.58]Sweetheart of your city
[00:13.64]Say what I'm dying to hear
[00:17.35]Cause I'm dying to hear you
[00:20.86]Tell me I'm that new thing
[00:22.93]Tell me that I'm relevant
[00:24.96]Tell me that I got a big heart
[00:27.04]Then back it up with evidence
[00:29.94]I need it and I don't know why
[00:34.28]This late at night
[00:36.32]Isn't it lonely
[00:39.24]I'd do anything to make you want me
[00:43.40]I'd give it all up if you told me
[00:47.42]That I'd be
[00:49.43]The number one girl in your eyes
[00:52.85]Your one and only
[00:55.74]So what's it gon' take for you to want me
[00:59.78]I'd give it all up if you told me
[01:03.89]That I'd be
[01:05.94]The number one girl in your eyes
[01:11.34]Tell me I'm going real big places
[01:14.32]Down to earth so friendly
[01:16.30]And even through all the phases
[01:18.46]Tell me you accept me
[01:21.56]Well that's all I'm dying to hear
[01:25.30]Yeah I'm dying to hear you
[01:28.91]Tell me that you need me
[01:30.85]Tell me that I'm loved
[01:32.90]Tell me that I'm worth it"""

# Reference clips offered as one-click audio prompts.
AUDIO_EXAMPLES = [
    ["./src/prompt/pop_cn.wav"],
    ["./src/prompt/pop_en.wav"],
    ["./src/prompt/rock_cn.wav"],
    ["./src/prompt/rock_en.wav"],
    ["./src/prompt/country_cn.wav"],
    ["./src/prompt/country_en.wav"],
    ["./src/prompt/classic_cn.wav"],
    ["./src/prompt/classic_en.wav"],
    ["./src/prompt/jazz_cn.wav"],
    ["./src/prompt/jazz_en.wav"],
    ["./src/prompt/default.wav"],
]

# Text of the collapsible "Best Practices Guide" on the generation tab.
BEST_PRACTICES_MD = """
1. **Lyrics Format Requirements**
- Each line must follow: `[mm:ss.xx]Lyric content`
- Example of valid format:
    ```
    [00:10.00]Moonlight spills through broken blinds
    [00:13.20]Your shadow dances on the dashboard shrine
    ```

2. **Generation Duration Limits**
- Current version supports maximum **95 seconds** of music generation
- Total timestamps should not exceed 01:35.00 (95 seconds)

3. **Audio Prompt Requirements**
- Reference audio should be ≥10 seconds for optimal results
- Shorter clips may lead to incoherent generation
"""

css = """
/* 固定文本域高度并强制滚动条 */
.lyrics-scroll-box textarea {
    height: 300px !important;  /* 固定高度 */
    max-height: 500px !important;   /* 最大高度 */
    overflow-y: auto !important;    /* 垂直滚动 */
    white-space: pre-wrap;          /* 保留换行 */
    line-height: 1.5;               /* 行高优化 */
}

.gr-examples {
    background: transparent !important;
    border: 1px solid #e0e0e0 !important;
    border-radius: 8px;
    margin: 1rem 0 !important;
    padding: 1rem !important;
}
"""


# @spaces.GPU
def infer_music(lrc, ref_audio_path, steps, file_type, max_frames=2048):
    """Generate a song from timestamped lyrics and a reference audio clip.

    Args:
        lrc: Lyrics text, passed to ``get_lrc_token``.
        ref_audio_path: Path of the reference audio, passed to
            ``get_style_prompt``.
        steps: Number of diffusion steps forwarded to ``inference``.
        file_type: Output format forwarded to ``inference``.
        max_frames: Forwarded to ``get_reference_latent`` and used as the
            ``duration`` argument of ``inference``.

    Returns:
        Whatever ``inference`` returns; the UI feeds it to an audio output.
    """
    # Sway sampling is only switched on for runs with fewer than 32 steps.
    sway_sampling_coef = -1 if steps < 32 else None

    lrc_prompt, start_time = get_lrc_token(lrc, tokenizer, device)
    style_prompt = get_style_prompt(muq, ref_audio_path)
    negative_style_prompt = get_negative_style_prompt(device)
    latent_prompt = get_reference_latent(device, max_frames)

    return inference(
        cfm_model=cfm,
        vae_model=vae,
        cond=latent_prompt,
        text=lrc_prompt,
        duration=max_frames,
        style_prompt=style_prompt,
        negative_style_prompt=negative_style_prompt,
        steps=steps,
        sway_sampling_coef=sway_sampling_coef,
        start_time=start_time,
        file_type=file_type,
    )


def _chat_completion(user_prompt):
    """Send ``user_prompt`` to the chat endpoint and return the reply text.

    The API key is read from the ``HS_DP_API`` environment variable.

    Raises:
        gr.Error: If the client cannot be created or the request fails, so
            the failure is shown in the UI instead of being written into
            the output textbox.
    """
    try:
        client = OpenAI(api_key=os.getenv('HS_DP_API'), base_url=ARK_BASE_URL)
        response = client.chat.completions.create(
            model=ARK_MODEL,
            messages=[
                {"role": "system", "content": LLM_SYSTEM_PROMPT},
                {"role": "user", "content": user_prompt},
            ],
            stream=False,
        )
    except OpenAIError as e:
        # The OpenAI client raises its own exception hierarchy; it does not
        # raise ``requests`` exceptions.
        print(f'请求出错: {e}')
        # Only the exception type goes to the browser; details stay in the log.
        raise gr.Error(
            f"LLM request failed ({type(e).__name__}). "
            "Check the server log for details."
        ) from e

    # ``content`` may be None; the output textbox expects a string.
    return response.choices[0].message.content or ""


def R1_infer1(theme, tags_gen, language):
    """Ask the LLM to write timestamped (LRC) lyrics for a theme.

    Args:
        theme: Song theme inserted into the prompt.
        tags_gen: Style tags inserted into the prompt.
        language: Lyrics language inserted into the prompt.

    Returns:
        The text of the model's reply.

    Raises:
        gr.Error: If the LLM request fails.
    """
    return _chat_completion(
        THEME_PROMPT.format(theme=theme, tags=tags_gen, language=language)
    )


def R1_infer2(tags_lyrics, lyrics_input):
    """Ask the LLM to add LRC timestamps to existing plain lyrics.

    Args:
        tags_lyrics: Style tags inserted into the prompt.
        lyrics_input: Plain lyrics inserted into the prompt.

    Returns:
        The text of the model's reply.

    Raises:
        gr.Error: If the LLM request fails.
    """
    return _chat_completion(
        TIMESTAMP_PROMPT.format(
            lyrics_input=lyrics_input, tags_lyrics=tags_lyrics
        )
    )


# NOTE(review): the container nesting below was reconstructed from a
# whitespace-collapsed source — confirm it matches the intended layout.
with gr.Blocks(css=css) as demo:
    # Page title.
    gr.HTML("\nDiffRhythm (谛韵)\n")

    with gr.Tabs() as tabs:

        # page 1
        with gr.Tab("Music Generate", id=0):
            with gr.Row():
                with gr.Column():
                    with gr.Accordion("Best Practices Guide", open=False):
                        gr.Markdown(BEST_PRACTICES_MD)
                    lrc = gr.Textbox(
                        label="Lrc",
                        placeholder="Input the full lyrics",
                        lines=12,
                        max_lines=50,
                        elem_classes="lyrics-scroll-box",
                        value=DEFAULT_LRC,
                    )
                    audio_prompt = gr.Audio(
                        label="Audio Prompt",
                        type="filepath",
                        value="./src/prompt/default.wav",
                    )

                with gr.Column():
                    lyrics_btn = gr.Button("Submit", variant="primary")
                    audio_output = gr.Audio(
                        label="Audio Result",
                        type="filepath",
                        elem_id="audio_output",
                    )
                    with gr.Accordion("Advanced Settings", open=False):
                        steps = gr.Slider(
                            minimum=10,
                            maximum=100,
                            value=32,
                            step=1,
                            label="Diffusion Steps",
                            interactive=True,
                            elem_id="step_slider",
                        )
                        file_type = gr.Dropdown(
                            ["wav", "mp3", "ogg"],
                            label="Output Format",
                            value="wav",
                        )

            gr.Examples(
                examples=AUDIO_EXAMPLES,
                inputs=[audio_prompt],
                label="Audio Examples",
                examples_per_page=11,
                elem_id="audio-examples-container",
            )

            gr.Examples(
                examples=[[DEFAULT_LRC], [EXAMPLE_LRC_2]],
                inputs=[lrc],
                label="Lrc Examples",
                examples_per_page=2,
                elem_id="lrc-examples-container",
            )

        # page 2
        with gr.Tab("LLM Generate LRC", id=1):
            with gr.Row():
                with gr.Column():
                    with gr.Accordion("Notice", open=False):
                        gr.Markdown("**Two Generation Modes:**\n1. Generate from theme & tags\n2. Add timestamps to existing lyrics")

                    with gr.Group():
                        gr.Markdown("### Method 1: Generate from Theme")
                        theme = gr.Textbox(
                            label="theme",
                            placeholder="Enter song theme, e.g. Love and Heartbreak",
                        )
                        tags_gen = gr.Textbox(
                            label="tags",
                            placeholder="Example: male pop confidence healing",
                        )
                        language = gr.Radio(
                            ["zh", "en"], label="Language", value="en"
                        )
                        gen_from_theme_btn = gr.Button(
                            "Generate LRC (From Theme)", variant="primary"
                        )

                    gr.Examples(
                        examples=[
                            [
                                "Love and Heartbreak",
                                "vocal emotional piano pop",
                                "en",
                            ],
                            [
                                "Heroic Epic",
                                "choir orchestral powerful",
                                "zh",
                            ],
                        ],
                        inputs=[theme, tags_gen, language],
                        label="Examples: Generate from Theme",
                    )

                    with gr.Group(visible=True):
                        gr.Markdown("### Method 2: Add Timestamps to Lyrics")
                        tags_lyrics = gr.Textbox(
                            label="tags",
                            placeholder="Example: female ballad piano slow",
                        )
                        lyrics_input = gr.Textbox(
                            label="Raw Lyrics (without timestamps)",
                            placeholder="Enter plain lyrics (without timestamps), e.g.:\nYesterday\nAll my troubles...",
                            lines=10,
                            max_lines=50,
                            elem_classes="lyrics-scroll-box",
                        )
                        gen_from_lyrics_btn = gr.Button(
                            "Generate LRC (From Lyrics)", variant="primary"
                        )

                    gr.Examples(
                        examples=[
                            [
                                "acoustic folk happy",
                                "I'm sitting here in the boring room\nIt's just another rainy Sunday afternoon",
                            ],
                            [
                                "electronic dance energetic",
                                "We're living in a material world\nAnd I am a material girl",
                            ],
                        ],
                        inputs=[tags_lyrics, lyrics_input],
                        label="Examples: Generate from Lyrics",
                    )

                with gr.Column():
                    lrc_output = gr.Textbox(
                        label="Generated LRC Lyrics",
                        placeholder="Timed lyrics will appear here",
                        lines=57,
                        elem_classes="lrc-output",
                        show_copy_button=True,
                    )

            # Bind functions
            gen_from_theme_btn.click(
                fn=R1_infer1,
                inputs=[theme, tags_gen, language],
                outputs=lrc_output,
            )

            gen_from_lyrics_btn.click(
                fn=R1_infer2,
                inputs=[tags_lyrics, lyrics_input],
                outputs=lrc_output,
            )

    # Tab switches have no side effects; the handler is an explicit no-op.
    tabs.select(
        lambda s: None,
        None,
        None,
    )

    lyrics_btn.click(
        fn=infer_music,
        inputs=[lrc, audio_prompt, steps, file_type],
        outputs=audio_output,
    )


if __name__ == "__main__":
    # Launch exactly once, with the queue and error display enabled.
    demo.queue().launch(show_api=False, show_error=True)