File size: 4,639 Bytes
a8cda10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import os

import gradio as gr
import soundfile as sf
import torch
from gradio_client import Client
from huggingface_hub import Repository
from pandas import read_csv

from transformers import pipeline


# Load the usernames that have already passed from the private dataset repo.
USERNAMES_DATASET_ID = "huggingface-course/audio-course-u7-hands-on"
HF_TOKEN = os.environ.get("HF_TOKEN")

# Build the URL with plain string formatting: os.path.join uses the host OS path
# separator (a backslash on Windows), which is not valid inside a URL. The host is
# the canonical Hub hostname, without a trailing dot.
usernames_url = f"https://huggingface.co/datasets/{USERNAMES_DATASET_ID}"

# Clone the dataset repo into the local `usernames` directory, then pull so the CSV
# read below reflects the latest pushed state.
usernames_repo = Repository(local_dir="usernames", clone_from=usernames_url, use_auth_token=HF_TOKEN)
usernames_repo.git_pull()

# The CSV lives inside the local clone; `all_results` is the in-memory copy that
# gets appended to and written back whenever a submission passes.
CSV_RESULTS_FILE = os.path.join("usernames", "usernames.csv")
all_results = read_csv(CSV_RESULTS_FILE)

# Load the LID (language identification) checkpoint, on GPU when one is available.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
pipe = pipeline("audio-classification", model="facebook/mms-lid-126", device=device)

# Static UI text and assessment settings.
TITLE = "🤗 Audio Transformers Course: Unit 7 Assessment"
# Markdown shown above the form. Links point at the canonical `huggingface.co` host
# (no trailing dot) so that a visitor's logged-in session applies when following them.
DESCRIPTION = """
Check that you have successfully completed the hands-on exercise for Unit 7 of the 🤗 Audio Transformers Course by submitting your demo to this Space.

As a reminder, you should start with the template Space provided at [`course-demos/speech-to-speech-translation`](https://huggingface.co/spaces/course-demos/speech-to-speech-translation),
and update the Space to translate from any language X to a **non-English** language Y.

Your demo should take as input an audio file, and return as output another audio file, matching the signature of the 
[`speech_to_speech_translation`](https://huggingface.co/spaces/course-demos/speech-to-speech-translation/blob/3946ba6705a6632a63de8672ac52a482ab74b3fc/app.py#L35)
function in the template demo.

To submit your demo for assessment, give the repo id or URL to your demo. For the template demo, this would be `course-demos/speech-to-speech-translation`.

This Space will submit a test file to your demo, and check that the output is non-English audio. If your demo successfully
returns an audio file, and this audio file is classified as being non-English, you will pass the demo and get a green 
tick next to your name! ✅

If you experience any issues with using this checker, [open an issue](https://huggingface.co/spaces/huggingface-course/audio-course-u7-assessment/discussions/new)
on this Space and tag [`@sanchit-gandhi`](https://huggingface.co/sanchit-gandhi).
"""
# Minimum score the first language prediction must reach; anything lower is
# reported as a random prediction.
THRESHOLD = 0.5
# Status text returned to the UI when a submission passes.
PASS_MESSAGE = "Congratulations! Your demo passed the assessment!"


def verify_demo(repo_id):
    """Assess a submitted speech-to-speech translation Space and record a pass.

    The Space is queried with the local ``test.wav`` file, the audio it returns is
    run through the language-identification pipeline, and the submission passes
    when the first prediction scores at least ``THRESHOLD`` and is not English.
    On a pass the username is appended to the results CSV and pushed to the Hub.

    Args:
        repo_id: Repo id (``user/space``) or full URL of the Space to assess.

    Returns:
        A tuple ``(PASS_MESSAGE, (sampling_rate, audio), label_outputs)`` where
        ``label_outputs`` maps each predicted language label to its score.

    Raises:
        gr.Error: If the repo id is malformed, the user has already passed, the
            Space cannot be loaded or queried, or the generated audio is
            classified with low confidence or as English.
    """
    # Tolerate surrounding whitespace and a trailing slash on pasted URLs; otherwise
    # the last path segment is empty and the Space name is mistaken for the user.
    repo_id = repo_id.strip().rstrip("/")
    split_repo_id = repo_id.split("/")

    if len(split_repo_id) < 2 or not split_repo_id[-2] or not split_repo_id[-1]:
        raise gr.Error(f"Ensure you pass a valid repo id to the assessor, got `{repo_id}`")

    # Keep only the trailing `user/space` pair so full URLs are accepted too.
    user_name = split_repo_id[-2]
    repo_id = "/".join(split_repo_id[-2:])

    # `x in series` tests the Series *index*, not its values, so compare against
    # the stored usernames explicitly.
    if user_name in all_results["username"].tolist():
        raise gr.Error(f"Username {user_name} has already passed the assessment!")

    try:
        client = Client(repo_id, hf_token=HF_TOKEN)
    except Exception as e:
        raise gr.Error(f"Error with loading Space: {e}") from e

    try:
        audio_file = client.predict("test.wav", api_name="/predict")
    except Exception as e:
        raise gr.Error(
            f"Error with querying Space, ensure your Space takes an audio file as input and returns an audio as output: {e}"
        ) from e

    audio, sampling_rate = sf.read(audio_file)

    # sf.read yields a (frames, channels) array for multi-channel files; the
    # classifier is given a mono signal, while the untouched audio is what gets
    # returned for playback.
    mono_audio = audio.mean(axis=1) if audio.ndim > 1 else audio

    language_prediction = pipe({"array": mono_audio, "sampling_rate": sampling_rate})

    label_outputs = {pred["label"]: pred["score"] for pred in language_prediction}

    top_prediction = language_prediction[0]

    if top_prediction["score"] < THRESHOLD:
        raise gr.Error(
            f"Model made random predictions - predicted {top_prediction['label']} with probability {top_prediction['score']}"
        )
    elif top_prediction["label"] == "eng":
        raise gr.Error(
            "Model generated an English audio - ensure the model is set to generate audio in a non-English language, e.g. Dutch"
        )

    # save and upload new evaluated usernames
    all_results.loc[len(all_results)] = {"username": user_name}
    all_results.to_csv(CSV_RESULTS_FILE, index=False)
    usernames_repo.push_to_hub()

    return PASS_MESSAGE, (sampling_rate, audio), label_outputs


# Assemble the Gradio UI: a single text field in; status, audio and label widgets out.
_repo_input = gr.Textbox(
    placeholder="course-demos/speech-to-speech-translation",
    label="Repo id or URL of your demo",
)
_result_widgets = [
    gr.Textbox(label="Status"),
    gr.Audio(label="Generated Speech", type="numpy"),
    gr.Label(label="Language prediction"),
]

# Wire the widgets to `verify_demo` and start serving the app.
demo = gr.Interface(
    fn=verify_demo,
    inputs=_repo_input,
    outputs=_result_widgets,
    title=TITLE,
    description=DESCRIPTION,
)
demo.launch()