import gradio as gr
import json
import os
import torchaudio
from infer import (
    WatermarkSolver,
    hamming_distance
)

# Predefined watermarks (instead of loading from a JSON file)
watermarks = {
    "VoiceMark": "1000010101010011",
    "Voice Cloning": "1111111001000010",
    "Speech Security": "1011101100001110",
    "Audio Watermarking": "0110110011100010",
    "Deep Learning": "0000100111111000",
    "Artificial Intelligence": "0010000100011111",
    "Hello World": "0001111101110001",
    "Happy New Year": "1101011011011101",
    "World Peace": "0011110010011110",
    "Good Morning": "0000001011000010",
}

# Initialize the WatermarkSolver model
solver = WatermarkSolver()
solver.load_model(checkpoint_dir="./", checkpoint_name="voicemark.pth", strict=True)

# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown(
        "# VoiceMark: Zero-Shot Voice Cloning-Resistant Watermarking Approach Leveraging Speaker-Specific Latents"
    )

    with gr.Column():
        gr.Image(
            value="voicemark_overview.png",
            width=925,
            height=487,
            elem_id="overview_image",
            label="overview"
        )
        gr.HTML("The overall architecture of our proposed VoiceMark")

    # Step 1: Upload audio and select watermark
    gr.Markdown(
        """
        **Step 1**: Upload an audio file or select one from the provided samples,
        choose a watermark, and generate the watermarked audio.
        """
    )

    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(label="Upload Audio", type="filepath")
            gr.Examples(
                examples=[
                    ["audios/1.wav"],
                    ["audios/2.wav"],
                    ["audios/3.wav"],
                    ["audios/4.wav"],
                    ["audios/5.wav"],
                ],
                inputs=audio_input,
                label="Sample Audios (Click to Use)"
            )

        with gr.Column():
            audio_output = gr.Audio(label="Watermarked Audio", type="filepath")
            watermark_list = gr.Dropdown(
                label="Select Watermark",
                choices=list(watermarks.keys()),
                interactive=True
            )
            add_watermark_button = gr.Button("Add Watermark to Audio")

    # Step 2: TTS tools demo links
    gr.Markdown(
        """
        **Step 2**: Download the generated watermarked audio, then use Zero-Shot
        Voice Cloning tools to generate the cloned audio. Some available tools are:
        - [CosyVoice2: Scalable Streaming Speech Synthesis with Large Language Models](https://www.modelscope.cn/studios/iic/CosyVoice2-0.5B)
        - [F5-TTS: A Fairytaler that Fakes Fluent and Faithful Speech with Flow Matching](https://huggingface.co./spaces/mrfakename/E2-F5-TTS)
        - [MaskGCT: Zero-Shot Text-to-Speech with Masked Generative Codec Transformer](https://huggingface.co./spaces/amphion/maskgct)
        """
    )

    # Step 3: Upload cloned audio to decode watermark
    gr.Markdown(
        """
        **Step 3**: Upload the cloned audio and decode your watermark.
        """
    )

    with gr.Row():
        decode_audio_input = gr.Audio(label="Upload Cloned Audio", type="filepath")
        with gr.Column():
            decoded_watermark_output = gr.Textbox(label="Decoded Watermark")
            decode_button = gr.Button("Decode Watermark")

    def process_audio(audio_path, watermark_text):
        """Embed the selected watermark into the uploaded audio and return the result."""
        if not audio_path:
            return "No audio selected. Please upload or select a sample."
        try:
            watermarked_audio = solver.infer_for_ui(
                audio_path, watermarks[watermark_text]
            )
            return watermarked_audio
        except ValueError as e:
            return str(e)

    add_watermark_button.click(
        process_audio,
        inputs=[audio_input, watermark_list],
        outputs=audio_output
    )

    def decode_watermark(audio_path):
        """Decode the watermark from the (cloned) audio and map it to the closest predefined label."""
        try:
            detect_prob, decoded_id = solver.decode_for_ui(audio_path)
            if detect_prob < 1e-2:
                return "No matching watermark found"
            # Find the predefined watermark with the smallest Hamming distance
            # to the decoded ID.
            closest_match = None
            min_distance = float("inf")
            for text, id_bin in watermarks.items():
                distance = hamming_distance(decoded_id, id_bin, base=16)
                if distance < min_distance:
                    closest_match = text
                    min_distance = distance
            # Accept the closest candidate only if fewer than 10 bits differ.
            if min_distance < 10:
                return closest_match
            return "No matching watermark found"
        except ValueError as e:
            return str(e)

    decode_button.click(
        decode_watermark,
        inputs=decode_audio_input,
        outputs=decoded_watermark_output
    )

# Launch the Gradio app
demo.launch()