import gradio as gr
from openai import OpenAI
import requests
import json
# from volcenginesdkarkruntime import Ark
import torch
import torchaudio
from einops import rearrange
import argparse
import json
import os
import gc
#import spaces
from tqdm import tqdm
import random
import numpy as np
import sys
from diffrhythm.infer.infer_utils import (
get_reference_latent,
get_lrc_token,
get_style_prompt,
prepare_model,
get_negative_style_prompt
)
from diffrhythm.infer.infer import inference
import devicetorch
# Resolve the compute device once through devicetorch; the same handle is
# passed to `prepare_model` and to the prompt helpers used in `infer_music`.
device = devicetorch.get(torch)

# Load the inference components a single time at import so every request
# reuses them: `cfm` and `vae` are handed to `inference`, `tokenizer` to
# `get_lrc_token`, and `muq` to `get_style_prompt`.
cfm, tokenizer, muq, vae = prepare_model(device)
# cfm = torch.compile(cfm)
# @spaces.GPU
def infer_music(lrc, ref_audio_path, steps, file_type, max_frames=2048):
    """Generate a song from timestamped lyrics and a reference audio clip.

    Args:
        lrc: Lyrics text, tokenized with ``get_lrc_token``.
        ref_audio_path: Path of the reference audio used to build the style
            prompt via ``get_style_prompt``.
        steps: Number of diffusion steps forwarded to ``inference``.
        file_type: Output format forwarded to ``inference`` (the UI offers
            "wav", "mp3" and "ogg").
        max_frames: Latent length; used both for the reference latent and as
            the ``duration`` argument of ``inference``.

    Returns:
        Whatever ``inference`` returns for the generated song; the UI feeds
        it to a ``gr.Audio`` output component.

    Raises:
        Any exception raised by the prompt helpers or by ``inference`` is
        propagated unchanged, after the memory cleanup has run.
    """
    # Sway sampling is only enabled (coefficient -1) for low step counts.
    sway_sampling_coef = -1 if steps < 32 else None

    try:
        lrc_prompt, start_time = get_lrc_token(lrc, tokenizer, device)
        style_prompt = get_style_prompt(muq, ref_audio_path)
        negative_style_prompt = get_negative_style_prompt(device)
        latent_prompt = get_reference_latent(device, max_frames)
        print(">0")
        generated_song = inference(
            cfm_model=cfm,
            vae_model=vae,
            cond=latent_prompt,
            text=lrc_prompt,
            duration=max_frames,
            style_prompt=style_prompt,
            negative_style_prompt=negative_style_prompt,
            steps=steps,
            sway_sampling_coef=sway_sampling_coef,
            start_time=start_time,
            file_type=file_type,
        )
    finally:
        # Always release memory, including when generation fails (e.g. on an
        # out-of-memory error). Collect Python garbage first so tensors that
        # are only kept alive by reference cycles are freed before the CUDA
        # caching allocator is asked to return its unused blocks.
        gc.collect()
        torch.cuda.empty_cache()

    print(">4")
    return generated_song
def R1_infer1(theme, tags_gen, language):
    """Ask the LLM endpoint to write timestamped (LRC) lyrics for a theme.

    Args:
        theme: Song theme inserted into the prompt.
        tags_gen: Style tags inserted into the prompt.
        language: Lyrics language inserted into the prompt (the UI offers
            "zh" and "en").

    Returns:
        The text content of the first completion choice, or an empty string
        when the request fails or the model returns no content. A string is
        always returned because the result is shown in a ``gr.Textbox``.
    """
    # Imported locally so this function does not depend on a change to the
    # file-level import list. OpenAIError is the base class of the errors
    # raised by the OpenAI client (missing API key, connection and HTTP
    # status errors); the client does not raise `requests` exceptions.
    from openai import OpenAIError

    llm_prompt = """
请围绕"{theme}"主题生成一首符合"{tags}"风格的语言为{language}的完整歌词。严格遵循以下要求:
### **强制格式规则**
1. **仅输出时间戳和歌词**,禁止任何括号、旁白、段落标记(如副歌、间奏、尾奏等注释)。
2. 每行格式必须为 `[mm:ss.xx]歌词内容`,时间戳与歌词间无空格,歌词内容需完整连贯。
3. 时间戳需自然分布,**第一句歌词起始时间不得为 [00:00.00]**,需考虑前奏空白。
### **内容与结构要求**
1. 歌词应富有变化,使情绪递进,整体连贯有层次感。**每行歌词长度应自然变化**,切勿长度一致,导致很格式化。
2. **时间戳分配应根据歌曲的标签、歌词的情感、节奏来合理推测**,而非机械地按照歌词长度分配。
3. 间奏/尾奏仅通过时间空白体现(如从 [02:30.00] 直接跳至 [02:50.00]),**无需文字描述**。
### **负面示例(禁止出现)**
- 错误:[01:30.00](钢琴间奏)
- 错误:[02:00.00][副歌]
- 错误:空行、换行符、注释
"""
    try:
        client = OpenAI(api_key=os.getenv('HS_DP_API'), base_url = "https://ark.cn-beijing.volces.com/api/v3")
        response = client.chat.completions.create(
            model="ep-20250304144033-nr9wl",
            messages=[
                {"role": "system", "content": "You are a professional musician who has been invited to make music-related comments."},
                {"role": "user", "content": llm_prompt.format(theme=theme, tags=tags_gen, language=language)},
            ],
            stream=False
        )
    except (OpenAIError, requests.exceptions.RequestException) as e:
        # Best-effort behaviour kept from the original: log and return an
        # empty value instead of failing the UI callback.
        print(f'请求出错: {e}')
        return ""

    # `content` may be None; normalise to a string for the Textbox.
    return response.choices[0].message.content or ""
def R1_infer2(tags_lyrics, lyrics_input):
    """Ask the LLM endpoint to add LRC timestamps to existing plain lyrics.

    Args:
        tags_lyrics: Style tags inserted into the prompt.
        lyrics_input: Raw lyrics (one line per lyric) inserted into the
            prompt.

    Returns:
        The text content of the first completion choice, or an empty string
        when the request fails or the model returns no content. A string is
        always returned because the result is shown in a ``gr.Textbox``.
    """
    # Imported locally so this function does not depend on a change to the
    # file-level import list. OpenAIError is the base class of the errors
    # raised by the OpenAI client (missing API key, connection and HTTP
    # status errors).
    from openai import OpenAIError

    llm_prompt = """
{lyrics_input}这是一首歌的歌词,每一行是一句歌词,{tags_lyrics}是我希望这首歌的风格,我现在想要给这首歌的每一句歌词打时间戳得到LRC,我希望时间戳分配应根据歌曲的标签、歌词的情感、节奏来合理推测,而非机械地按照歌词长度分配。第一句歌词的时间戳应考虑前奏长度,避免歌词从 `[00:00.00]` 直接开始。严格按照 LRC 格式输出歌词,每行格式为 `[mm:ss.xx]歌词内容`。最后的结果只输出LRC,不需要其他的解释。
"""
    try:
        client = OpenAI(api_key=os.getenv('HS_DP_API'), base_url = "https://ark.cn-beijing.volces.com/api/v3")
        response = client.chat.completions.create(
            model="ep-20250304144033-nr9wl",
            messages=[
                {"role": "system", "content": "You are a professional musician who has been invited to make music-related comments."},
                {"role": "user", "content": llm_prompt.format(lyrics_input=lyrics_input, tags_lyrics=tags_lyrics)},
            ],
            stream=False
        )
    except OpenAIError as e:
        # Same best-effort handling as the theme-based generator: log the
        # failure and return an empty value instead of failing the callback.
        print(f'请求出错: {e}')
        return ""

    # `content` may be None; normalise to a string for the Textbox.
    return response.choices[0].message.content or ""
# Custom stylesheet passed to `gr.Blocks(css=...)`.
# - `.lyrics-scroll-box textarea`: fixed-height, vertically scrolling text
#   area; applied to the lyric textboxes through `elem_classes`.
# - `.gr-examples`: transparent, bordered, rounded container styling.
css = """
/* 固定文本域高度并强制滚动条 */
.lyrics-scroll-box textarea {
height: 300px !important; /* 固定高度 */
max-height: 500px !important; /* 最大高度 */
overflow-y: auto !important; /* 垂直滚动 */
white-space: pre-wrap; /* 保留换行 */
line-height: 1.5; /* 行高优化 */
}
.gr-examples {
background: transparent !important;
border: 1px solid #e0e0e0 !important;
border-radius: 8px;
margin: 1rem 0 !important;
padding: 1rem !important;
}
"""
# Default lyrics pre-filled in the "Lrc" textbox. The same text is offered as
# the first entry of the "Lrc Examples" list, so it is defined once here.
_DEFAULT_LRC = (
    "[00:10.00]Moonlight spills through broken blinds\n"
    "[00:13.20]Your shadow dances on the dashboard shrine\n"
    "[00:16.85]Neon ghosts in gasoline rain\n"
    "[00:20.40]I hear your laughter down the midnight train\n"
    "[00:24.15]Static whispers through frayed wires\n"
    "[00:27.65]Guitar strings hum our cathedral choirs\n"
    "[00:31.30]Flicker screens show reruns of June\n"
    "[00:34.90]I'm drowning in this mercury lagoon\n"
    "[00:38.55]Electric veins pulse through concrete skies\n"
    "[00:42.10]Your name echoes in the hollow where my heartbeat lies\n"
    "[00:45.75]We're satellites trapped in parallel light\n"
    "[00:49.25]Burning through the atmosphere of endless night\n"
    "[01:00.00]Dusty vinyl spins reverse\n"
    "[01:03.45]Our polaroid timeline bleeds through the verse\n"
    "[01:07.10]Telescope aimed at dead stars\n"
    "[01:10.65]Still tracing constellations through prison bars\n"
    "[01:14.30]Electric veins pulse through concrete skies\n"
    "[01:17.85]Your name echoes in the hollow where my heartbeat lies\n"
    "[01:21.50]We're satellites trapped in parallel light\n"
    "[01:25.05]Burning through the atmosphere of endless night\n"
    "[02:10.00]Clockwork gears grind moonbeams to rust\n"
    "[02:13.50]Our fingerprint smudged by interstellar dust\n"
    "[02:17.15]Velvet thunder rolls through my veins\n"
    "[02:20.70]Chasing phantom trains through solar plane\n"
    "[02:24.35]Electric veins pulse through concrete skies\n"
    "[02:27.90]Your name echoes in the hollow where my heartbeat lies"
)

# Second entry of the "Lrc Examples" list.
_ALT_LRC_EXAMPLE = (
    "[00:04.34]Tell me that I'm special\n"
    "[00:06.57]Tell me I look pretty\n"
    "[00:08.46]Tell me I'm a little angel\n"
    "[00:10.58]Sweetheart of your city\n"
    "[00:13.64]Say what I'm dying to hear\n"
    "[00:17.35]Cause I'm dying to hear you\n"
    "[00:20.86]Tell me I'm that new thing\n"
    "[00:22.93]Tell me that I'm relevant\n"
    "[00:24.96]Tell me that I got a big heart\n"
    "[00:27.04]Then back it up with evidence\n"
    "[00:29.94]I need it and I don't know why\n"
    "[00:34.28]This late at night\n"
    "[00:36.32]Isn't it lonely\n"
    "[00:39.24]I'd do anything to make you want me\n"
    "[00:43.40]I'd give it all up if you told me\n"
    "[00:47.42]That I'd be\n"
    "[00:49.43]The number one girl in your eyes\n"
    "[00:52.85]Your one and only\n"
    "[00:55.74]So what's it gon' take for you to want me\n"
    "[00:59.78]I'd give it all up if you told me\n"
    "[01:03.89]That I'd be\n"
    "[01:05.94]The number one girl in your eyes\n"
    "[01:11.34]Tell me I'm going real big places\n"
    "[01:14.32]Down to earth so friendly\n"
    "[01:16.30]And even through all the phases\n"
    "[01:18.46]Tell me you accept me\n"
    "[01:21.56]Well that's all I'm dying to hear\n"
    "[01:25.30]Yeah I'm dying to hear you\n"
    "[01:28.91]Tell me that you need me\n"
    "[01:30.85]Tell me that I'm loved\n"
    "[01:32.90]Tell me that I'm worth it"
)

# Two-tab UI: tab 0 generates music from LRC lyrics plus an audio prompt,
# tab 1 produces LRC lyrics through the LLM helpers.
# NOTE(review): the container nesting below is reconstructed from the
# component order; confirm the layout against the running app.
with gr.Blocks(css=css) as demo:
    # gr.Markdown("DiffRhythm (谛韵)")
    gr.HTML("""
DiffRhythm (谛韵)
""")

    with gr.Tabs() as tabs:
        # page 1
        with gr.Tab("Music Generate", id=0):
            with gr.Row():
                with gr.Column():
                    with gr.Accordion("Best Practices Guide", open=False):
                        gr.Markdown("""
1. **Lyrics Format Requirements**
- Each line must follow: `[mm:ss.xx]Lyric content`
- Example of valid format:
```
[00:10.00]Moonlight spills through broken blinds
[00:13.20]Your shadow dances on the dashboard shrine
```
2. **Generation Duration Limits**
- Current version supports maximum **95 seconds** of music generation
- Total timestamps should not exceed 01:35.00 (95 seconds)
3. **Audio Prompt Requirements**
- Reference audio should be ≥10 seconds for optimal results
- Shorter clips may lead to incoherent generation
""")
                    lrc = gr.Textbox(
                        label="Lrc",
                        placeholder="Input the full lyrics",
                        lines=12,
                        max_lines=50,
                        elem_classes="lyrics-scroll-box",
                        value=_DEFAULT_LRC,
                    )
                    audio_prompt = gr.Audio(label="Audio Prompt", type="filepath", value="./src/prompt/default.wav")

                with gr.Column():
                    lyrics_btn = gr.Button("Submit", variant="primary")
                    audio_output = gr.Audio(label="Audio Result", type="filepath", elem_id="audio_output")
                    with gr.Accordion("Advanced Settings", open=False):
                        steps = gr.Slider(
                            minimum=10,
                            maximum=100,
                            value=32,
                            step=1,
                            label="Diffusion Steps",
                            interactive=True,
                            elem_id="step_slider"
                        )
                        file_type = gr.Dropdown(["wav", "mp3", "ogg"], label="Output Format", value="wav")

            # Clicking an example fills the "Audio Prompt" component.
            gr.Examples(
                examples=[
                    ["./src/prompt/pop_cn.wav"],
                    ["./src/prompt/pop_en.wav"],
                    ["./src/prompt/rock_cn.wav"],
                    ["./src/prompt/rock_en.wav"],
                    ["./src/prompt/country_cn.wav"],
                    ["./src/prompt/country_en.wav"],
                    ["./src/prompt/classic_cn.wav"],
                    ["./src/prompt/classic_en.wav"],
                    ["./src/prompt/jazz_cn.wav"],
                    ["./src/prompt/jazz_en.wav"],
                    ["./src/prompt/default.wav"]
                ],
                inputs=[audio_prompt],
                label="Audio Examples",
                examples_per_page=11,
                elem_id="audio-examples-container"
            )

            # Clicking an example fills the "Lrc" textbox.
            gr.Examples(
                examples=[
                    [_DEFAULT_LRC],
                    [_ALT_LRC_EXAMPLE]
                ],
                inputs=[lrc],
                label="Lrc Examples",
                examples_per_page=2,
                elem_id="lrc-examples-container",
            )

        # page 2
        with gr.Tab("LLM Generate LRC", id=1):
            with gr.Row():
                with gr.Column():
                    with gr.Accordion("Notice", open=False):
                        gr.Markdown("**Two Generation Modes:**\n1. Generate from theme & tags\n2. Add timestamps to existing lyrics")

                    with gr.Group():
                        gr.Markdown("### Method 1: Generate from Theme")
                        theme = gr.Textbox(label="theme", placeholder="Enter song theme, e.g. Love and Heartbreak")
                        tags_gen = gr.Textbox(label="tags", placeholder="Example: male pop confidence healing")
                        language = gr.Radio(["zh", "en"], label="Language", value="en")
                        gen_from_theme_btn = gr.Button("Generate LRC (From Theme)", variant="primary")
                        gr.Examples(
                            examples=[
                                [
                                    "Love and Heartbreak",
                                    "vocal emotional piano pop",
                                    "en"
                                ],
                                [
                                    "Heroic Epic",
                                    "choir orchestral powerful",
                                    "zh"
                                ]
                            ],
                            inputs=[theme, tags_gen, language],
                            label="Examples: Generate from Theme"
                        )

                    with gr.Group(visible=True):
                        gr.Markdown("### Method 2: Add Timestamps to Lyrics")
                        tags_lyrics = gr.Textbox(label="tags", placeholder="Example: female ballad piano slow")
                        lyrics_input = gr.Textbox(
                            label="Raw Lyrics (without timestamps)",
                            placeholder="Enter plain lyrics (without timestamps), e.g.:\nYesterday\nAll my troubles...",
                            lines=10,
                            max_lines=50,
                            elem_classes="lyrics-scroll-box"
                        )
                        gen_from_lyrics_btn = gr.Button("Generate LRC (From Lyrics)", variant="primary")
                        gr.Examples(
                            examples=[
                                [
                                    "acoustic folk happy",
                                    """I'm sitting here in the boring room\nIt's just another rainy Sunday afternoon"""
                                ],
                                [
                                    "electronic dance energetic",
                                    """We're living in a material world\nAnd I am a material girl"""
                                ]
                            ],
                            inputs=[tags_lyrics, lyrics_input],
                            label="Examples: Generate from Lyrics"
                        )

                with gr.Column():
                    lrc_output = gr.Textbox(
                        label="Generated LRC Lyrics",
                        placeholder="Timed lyrics will appear here",
                        lines=57,
                        elem_classes="lrc-output",
                        show_copy_button=True
                    )

            # Bind functions
            gen_from_theme_btn.click(
                fn=R1_infer1,
                inputs=[theme, tags_gen, language],
                outputs=lrc_output
            )
            gen_from_lyrics_btn.click(
                fn=R1_infer2,
                inputs=[tags_lyrics, lyrics_input],
                outputs=lrc_output
            )

    # No-op tab-switch handler. It is registered without input components,
    # so the callback must be callable with zero arguments; accepting any
    # arity avoids a TypeError on each tab change.
    tabs.select(
        lambda *_: None,
        None,
        None
    )

    lyrics_btn.click(
        fn=infer_music,
        inputs=[lrc, audio_prompt, steps, file_type],
        outputs=audio_output
    )
if __name__ == "__main__":
    # Start the server exactly once, and only when the file is run as a
    # script. Previously the app was launched unconditionally and then a
    # second time under this guard, so importing the module started a server
    # and, once the first blocking launch returned, a second server came up
    # without request queuing or the launch options below.
    demo.queue().launch(show_api=False, show_error=True)