from logging import ERROR

import librosa
import soundfile as sf
import streamlit as st
import whisper
from audio_separator.separator import Separator
from hezar.models import Model
from pydub import AudioSegment
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def cosine_sim(text1, text2):
    # Vectorize both texts with TF-IDF and return their cosine similarity.
    vectors = TfidfVectorizer().fit_transform([text1, text2]).toarray()
    return cosine_similarity(vectors)[0, 1]
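
# Quick sanity check (illustrative, not part of the original file): identical
# texts score 1.0, texts with no shared terms score 0.0.
#
#   cosine_sim("hello world", "hello world")   # -> 1.0
#   cosine_sim("hello world", "goodbye moon")  # -> 0.0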

def take_challenge(music_file, typed_lyrics, key, language, has_background=False, background_audio_path=None):
    st.write("Listen to the music; you will have to record the next 15 seconds yourself")
    st.audio(music_file)
    if has_background:
        st.write("Playing this backing track while you sing might help:")
        st.audio(background_audio_path)
    audio_value = st.experimental_audio_input("Sing the rest of the song: 🎙️", key=key)
    if audio_value:
        with open("user_sing.mp3", "wb") as f:
            f.write(audio_value.getbuffer())
        # If a backing track was playing, strip it out and transcribe only the vocal stem.
        if has_background:
            file_to_transcribe = split_vocals("user_sing.mp3")[1]
        else:
            file_to_transcribe = "user_sing.mp3"
        if language == "en":
            english_model = whisper.load_model("base.en")
            user_lyrics = english_model.transcribe(file_to_transcribe, language=language)["text"]
        else:
            persian_model = Model.load("hezarai/whisper-small-fa")
            user_lyrics = persian_model.predict(file_to_transcribe)[0]["text"]
        st.write(user_lyrics)
        similarity_score = cosine_sim(typed_lyrics, user_lyrics)
        if similarity_score > 0.85:
            st.success("Awesome! You are doing great", icon="✅")
            st.markdown('<style>div.stAlert { background-color: rgba(3, 67, 24, 0.9); }</style>', unsafe_allow_html=True)
        else:
            st.error("Awful! Try harder next time", icon="🚨")
            st.markdown('<style>div.stAlert { background-color: rgba(241, 36, 36, 0.9); }</style>', unsafe_allow_html=True)
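
# A minimal sketch of how take_challenge might be wired up from the main app.
# The song path, lyrics, and widget key below are illustrative assumptions,
# not taken from the original Space:
#
#   take_challenge("songs/wish_you_were_here_15s.mp3",
#                  "How I wish, how I wish you were here",
#                  key="challenge_1", language="en")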

def change_volume(input_file, output_file, volume_factor):
    # pydub gain is in dB: positive values boost, negative values attenuate.
    sound = AudioSegment.from_mp3(input_file)
    volume_changed = sound + volume_factor
    volume_changed.export(output_file, format="mp3")

def change_speed(input_file, output_file, speed_factor):
    # time_stretch changes tempo without shifting pitch (>1 is faster).
    # Note librosa.load resamples to 22.05 kHz by default.
    sound, sr = librosa.load(input_file)
    speed_changed = librosa.effects.time_stretch(sound, rate=speed_factor)
    sf.write(output_file, speed_changed, sr)

def change_pitch(input_file, output_file, pitch_factor):
    # pitch_shift moves the pitch by pitch_factor semitones, keeping tempo.
    sound, sr = librosa.load(input_file)
    pitch_changed = librosa.effects.pitch_shift(sound, sr=sr, n_steps=pitch_factor)
    sf.write(output_file, pitch_changed, sr)

def low_pass_filter(input_file, output_file, cutoff_freq):
    # Attenuate frequencies above cutoff_freq (Hz).
    sound = AudioSegment.from_mp3(input_file)
    low_filtered_sound = sound.low_pass_filter(cutoff_freq)
    low_filtered_sound.export(output_file, format="mp3")

def high_pass_filter(input_file, output_file, cutoff_freq):
    # Attenuate frequencies below cutoff_freq (Hz).
    sound = AudioSegment.from_mp3(input_file)
    high_filtered_sound = sound.high_pass_filter(cutoff_freq)
    high_filtered_sound.export(output_file, format="mp3")

def pan_left_right(input_file, output_file, pan_factor):
    # pan_factor ranges from -1.0 (full left) to +1.0 (full right).
    sound = AudioSegment.from_mp3(input_file)
    pan_sound = sound.pan(pan_factor)
    pan_sound.export(output_file, format="mp3")

def fade_in_ms(input_file, output_file, fade_factor):
    # Fade in over the first fade_factor milliseconds.
    sound = AudioSegment.from_mp3(input_file)
    faded_sound = sound.fade_in(fade_factor)
    faded_sound.export(output_file, format="mp3")

def fade_out_ms(input_file, output_file, fade_factor):
    # Fade out over the last fade_factor milliseconds.
    sound = AudioSegment.from_mp3(input_file)
    faded_sound = sound.fade_out(fade_factor)
    faded_sound.export(output_file, format="mp3")

def split_vocals(input_file):
    # Separate the recording into an instrumental and a vocal stem; per the
    # output names below, the returned list is [instrumental, vocals].
    separator = Separator(output_format="mp3", log_level=ERROR)
    separator.load_model("MGM_MAIN_v4.pth")
    result_list = separator.separate(
        input_file,
        primary_output_name=input_file[:-4] + "_instruments",
        secondary_output_name=input_file[:-4] + "_vocals",
    )
    return result_list
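
# A minimal usage sketch for the effect helpers, assuming they are chained by
# writing each stage to disk. File names, gain, and cutoff values below are
# illustrative assumptions only:
#
#   change_volume("song.mp3", "song_loud.mp3", volume_factor=6)        # +6 dB
#   change_speed("song_loud.mp3", "song_fast.mp3", speed_factor=1.25)
#   low_pass_filter("song_fast.mp3", "song_muffled.mp3", cutoff_freq=2000)
#   stems = split_vocals("song_muffled.mp3")  # [instrumental, vocals]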