Spaces:

moondream
/

video-redaction

Running on Zero

File size: 7,812 Bytes

#!/usr/bin/env python3
import gradio as gr
import os
from main import load_moondream, process_video
import tempfile
import shutil
import torch
import spaces

# Get absolute path to workspace root
# (the directory containing this file; process_video_file creates its
# "inputs" and "outputs" folders beneath it)
WORKSPACE_ROOT = os.path.dirname(os.path.abspath(__file__))


# Initialize model globally for reuse
# NOTE(review): `model` and `tokenizer` are not referenced again in this file
# and are not passed to process_video() -- presumably main.py manages its own
# model instance; confirm whether this import-time load is still needed.
print("Loading Moondream model...")
model, tokenizer = load_moondream()

def _find_fallback_output(outputs_dir, box_style, detect_keyword, video_filename):
    """Locate a processed video by naming convention; return its path or None.

    The exact expected name ``{box_style}_{detect_keyword}_{video_filename}``
    is tried first. Otherwise the most recently modified file in
    ``outputs_dir`` that starts with ``{box_style}_{detect_keyword}_`` is
    used, so a stale result from an earlier run is not preferred over a
    fresh one.
    """
    prefix = f"{box_style}_{detect_keyword}_"
    expected_output = os.path.join(outputs_dir, f"{prefix}{video_filename}")
    if os.path.exists(expected_output):
        return expected_output

    candidates = [
        os.path.join(outputs_dir, name)
        for name in os.listdir(outputs_dir)
        if name.startswith(prefix)
    ]
    if not candidates:
        return None
    return max(candidates, key=os.path.getmtime)


# Hugging Face Spaces GPU decorator (active: this module requires `spaces`)
@spaces.GPU(duration=120)
def process_video_file(
    video_file, detect_keyword, box_style, ffmpeg_preset, rows, cols, test_mode
):
    """Process a video file through the Gradio interface.

    The uploaded file is copied into ``<WORKSPACE_ROOT>/inputs`` under the
    name ``input_<basename>``, handed to ``process_video``, and the copy is
    removed again whether or not processing succeeds.

    Args:
        video_file: Filesystem path of the uploaded video.
        detect_keyword: Description of what to detect; forwarded to
            ``process_video``.
        box_style: Visualization style; forwarded to ``process_video``.
        ffmpeg_preset: Encoder preset; forwarded to ``process_video``.
        rows: Grid rows; forwarded to ``process_video``.
        cols: Grid columns; forwarded to ``process_video``.
        test_mode: Forwarded to ``process_video`` as ``test_mode``.

    Returns:
        Absolute path of the processed video.

    Raises:
        gr.Error: If no video was supplied, the output cannot be located, or
            any other exception occurs during processing.
    """
    try:
        if not video_file:
            raise gr.Error("Please upload a video file")

        # Ensure input/output directories exist using absolute paths
        inputs_dir = os.path.join(WORKSPACE_ROOT, "inputs")
        outputs_dir = os.path.join(WORKSPACE_ROOT, "outputs")
        os.makedirs(inputs_dir, exist_ok=True)
        os.makedirs(outputs_dir, exist_ok=True)

        # Copy uploaded video to inputs directory
        video_filename = f"input_{os.path.basename(video_file)}"
        input_video_path = os.path.join(inputs_dir, video_filename)
        shutil.copy2(video_file, input_video_path)

        try:
            # Process the video
            output_path = process_video(
                input_video_path,
                detect_keyword,
                test_mode=test_mode,
                ffmpeg_preset=ffmpeg_preset,
                rows=rows,
                cols=cols,
                box_style=box_style,
            )

            # Verify output exists; otherwise fall back to the naming convention
            if not output_path or not os.path.exists(output_path):
                print(f"Warning: Output path {output_path} does not exist")
                output_path = _find_fallback_output(
                    outputs_dir, box_style, detect_keyword, video_filename
                )
                if output_path is None:
                    raise gr.Error("Failed to locate output video")

            # Convert output path to absolute path if it isn't already
            if not os.path.isabs(output_path):
                output_path = os.path.join(WORKSPACE_ROOT, output_path)

            print(f"Returning output path: {output_path}")
            return output_path

        finally:
            # Best-effort cleanup of the copied input; never mask the real
            # result or error, but do not hide unexpected failures either.
            try:
                os.remove(input_video_path)
            except FileNotFoundError:
                pass
            except OSError as cleanup_err:
                print(
                    f"Warning: could not remove {input_video_path}: {cleanup_err}"
                )

    except gr.Error:
        # Already a user-facing error: surface its message unchanged instead
        # of wrapping it in a second "Error processing video: ..." layer.
        raise
    except Exception as e:
        print(f"Error in process_video_file: {e}")
        raise gr.Error(f"Error processing video: {e}") from e


# Create the Gradio interface
# Layout: a two-column row. The left column holds the inputs (video upload,
# detection prompt, examples, advanced settings); the right column holds the
# processed video and reference links.
with gr.Blocks(title="Promptable Video Redaction") as app:
    gr.Markdown("# Promptable Video Redaction with Moondream")
    gr.Markdown(
        """
    [Moondream 2B](https://github.com/vikhyat/moondream) is a lightweight vision model that detects and visualizes objects in videos. It can identify objects, people, text and more.

    Upload a video and specify what to detect. The app will process each frame and apply your chosen visualization style. For help, join the [Moondream Discord](https://discord.com/invite/tRUdpjDQfH).
    """
    )

    with gr.Row():
        with gr.Column():
            # Input components
            video_input = gr.Video(label="Upload Video")

            detect_input = gr.Textbox(
                label="What to Detect",
                placeholder="e.g. face, logo, text, person, car, dog, etc.",
                value="face",
                info="Moondream can detect anything that you can describe in natural language",
            )

            # Each example fills the video and the detection prompt together
            gr.Examples(
                examples=[
                    ["examples/homealone.mp4", "face"],
                    ["examples/soccer.mp4", "ball"],
                    ["examples/rally.mp4", "license plate"],
                ],
                inputs=[video_input, detect_input],
                label="Try these examples",
            )

            process_btn = gr.Button("Process Video", variant="primary")

            with gr.Accordion("Advanced Settings", open=False):
                box_style_input = gr.Radio(
                    choices=["censor", "bounding-box", "hitmarker"],
                    value="censor",
                    label="Visualization Style",
                    info="Choose how to display detections",
                )
                # Passed to process_video as `ffmpeg_preset`
                preset_input = gr.Dropdown(
                    choices=[
                        "ultrafast",
                        "superfast",
                        "veryfast",
                        "faster",
                        "fast",
                        "medium",
                        "slow",
                        "slower",
                        "veryslow",
                    ],
                    value="medium",
                    label="Processing Speed (faster = lower quality)",
                )
                with gr.Row():
                    rows_input = gr.Slider(
                        minimum=1, maximum=4, value=1, step=1, label="Grid Rows"
                    )
                    cols_input = gr.Slider(
                        minimum=1, maximum=4, value=1, step=1, label="Grid Columns"
                    )

                test_mode_input = gr.Checkbox(
                    label="Test Mode (Process first 3 seconds only)",
                    value=True,
                    info="Enable to quickly test settings on a short clip before processing the full video (recommended)",
                )

                gr.Markdown(
                    """
                Note: Processing in test mode will only process the first 3 seconds of the video and is recommended for testing settings.
                """
                )

                gr.Markdown(
                    """
                We can get a rough estimate of how long the video will take to process by multiplying the video's framerate * seconds * the number of rows and columns and assuming 0.12 seconds processing time per detection.
                For example, a 3 second video at 30fps with 2x2 grid, the estimated time is 3 * 30 * 2 * 2 * 0.12 = 43.2 seconds (tested on a 4090 GPU).
                """
                )

        with gr.Column():
            # Output components
            video_output = gr.Video(label="Processed Video")

            # About section under the video output
            gr.Markdown(
                """
            ### Links:
            - [GitHub Repository](https://github.com/vikhyat/moondream)
            - [Hugging Face](https://huggingface.co/vikhyatk/moondream2)
            - [Python Package](https://pypi.org/project/moondream/)
            - [Moondream Recipes](https://docs.moondream.ai/recipes)
            """
            )

    # Event handlers
    # The order of `inputs` must match the positional parameters of
    # process_video_file: (video_file, detect_keyword, box_style,
    # ffmpeg_preset, rows, cols, test_mode).
    process_btn.click(
        fn=process_video_file,
        inputs=[
            video_input,
            detect_input,
            box_style_input,
            preset_input,
            rows_input,
            cols_input,
            test_mode_input,
        ],
        outputs=video_output,
    )

# Launch the app only when this file is executed as a script, not on import.
# NOTE(review): share=True presumably asks Gradio for a public share link --
# confirm this is intended for local runs and harmless on the hosted Space.
if __name__ == "__main__":
    app.launch(share=True)