import os, sys, re, json
import argparse
import shutil
import warnings

import whisper_timestamped as wt
from pdb import set_trace as b
from pprint import pprint as pp
from profanity_check import predict, predict_prob
from pydub import AudioSegment
from pydub.playback import play
from subprocess import Popen, PIPE


def parse_args():
    """Parse command-line arguments."""
    parser = argparse.ArgumentParser(
        description=('Tool to mute profanities in a song '
                     '(source separation -> speech recognition -> profanity detection -> mask profanities -> re-mix)'),
        usage=('%(prog)s -i <input file> [options], or run as a local web app with: streamlit run <this script>')
    )
    parser.add_argument(
        '-i', '--input',
        default=None,
        nargs='?',
        #required=True,
        help=("path to an input .mp3 or .wav file")
    )
    parser.add_argument(
        '-m', '--model',
        default='small',
        nargs='?',
        help=("model used by whisper for speech recognition: tiny, small (default) or medium")
    )
    parser.add_argument(
        '-p', '--play',
        default=False,
        action='store_true',
        help=("play output audio at the end")
    )
    parser.add_argument(
        '-v', '--verbose',
        default=True,
        action='store_true',
        help=("print transcribed text and detected profanities to screen")
    )
    return parser.parse_args()


def main(args, input_file=None, model_size=None, verbose=False, play_output=False, skip_ss=False):
    """Run the full pipeline: source separation -> transcription -> profanity detection -> masking -> re-mix."""
    if not input_file:
        input_file = args.input
    if not model_size:
        model_size = args.model
    if not verbose:
        verbose = args.verbose
    if not play_output:
        play_output = args.play

    # exit if input file not found
    if len(sys.argv) > 1 and not os.path.isfile(input_file):
        print('Error: --input file not found')
        raise Exception
    print(f'\nProcessing input file: {input_file}')

    if not skip_ss:
        # split audio into vocals + accompaniment
        print('Running source separation')
        stems_dir = source_separation(input_file, use_demucs=False, use_spleeter=True)
        vocal_stem = os.path.join(stems_dir, 'vocals.wav')
        #instr_stem = os.path.join(stems_dir, 'no_vocals.wav')  # demucs
        instr_stem = os.path.join(stems_dir, 'accompaniment.wav')  # spleeter
        print(f'Vocal stem written to: {vocal_stem}')
    else:
        vocal_stem = input_file
        instr_stem = None

    # transcribe the vocal stem with word-level timestamps
    audio = wt.load_audio(vocal_stem)
    model = wt.load_model(model_size, device='cpu')
    text = wt.transcribe(model, audio, language='en')
    if verbose:
        print('\nTranscribed text:')
        print(text['text'] + '\n')

    # check for profanities in the transcribed text
    print('Run profanity detection on text')
    profanities = profanity_detection(text)
    if not profanities:
        print(f'No profanities found in {input_file} - exiting')
        return 'No profanities found', None, None
    if verbose:
        print('profanities found in text:')
        pp(profanities)

    # masking
    print('Mask profanities in vocal stem')
    vocals = mask_profanities(vocal_stem, profanities)

    # re-mixing
    print('Merge instrumentals stem and masked vocals stem')
    if not skip_ss:
        mix = AudioSegment.from_wav(instr_stem).overlay(vocals)
    else:
        mix = vocals

    # write mix to file
    outpath = input_file.replace('.mp3', '_masked.mp3').replace('.wav', '_masked.wav')
    if input_file.endswith('.wav'):
        mix.export(outpath, format="wav")
    elif input_file.endswith('.mp3'):
        mix.export(outpath, format="mp3")
    print(f'Mixed file written to: {outpath}')

    # play output
    if play_output:
        print('\nPlaying output...')
        play(mix)

    return outpath, vocal_stem, instr_stem


def source_separation(inpath, use_demucs=False, use_spleeter=True):
    """Execute a shell command to run demucs or spleeter and pipe stdout/stderr back to python."""
    infile = os.path.basename(inpath)
    if use_demucs:
        cmd = f'demucs --two-stems=vocals --jobs 8 "{inpath}"'
        #stems_dir = os.path.join(re.findall('/.*', stdout)[0], infile.replace('.mp3','').replace('.wav',''))
    elif use_spleeter:
        outdir = 'audio/separated'
        cmd = f'spleeter separate {inpath} -p spleeter:2stems -o {outdir}'
        stems_dir = os.path.join(outdir, os.path.splitext(infile)[0])

    stdout, stderr = Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True, executable='/bin/bash').communicate()
    stdout = stdout.decode('utf8')

    # exit if the separation library errored out
    if stderr:
        stderr = stderr.decode('utf8').lower()
        if 'error' in stderr or 'not exist' in stderr:
            print(stderr.split('\n')[0])
            raise Exception

    # return the stems directory path if separation succeeded
    if not os.path.isdir(stems_dir):
        print(f'Error: output stem directory "{stems_dir}" not found')
        raise Exception
    return stems_dir


def profanity_detection(text):
    """Detect profanities in the transcribed text and return the matching word dicts (with timestamps)."""
    profs = []
    for segment in text['segments']:
        for word in segment['words']:
            #if word['confidence'] < .25:
            #    print(word)
            word_text = word['text'].replace('.', '').replace(',', '').lower()
            # skip false positives
            if word_text in ['cancer','hell','junk','die','lame','freak','freaky','white','stink','shut','spit','mouth','orders','eat','clouds','ugly','dirty','wet']:
                continue
            # assume anything whisper returns with two or more asterisks is a censored profanity, e.g. n***a
            if '**' in word_text:
                profs.append(word)
                continue
            # add known false negatives missed by the classifier
            if word_text in ['bitchy', 'puss']:
                profs.append(word)
                continue
            # run profanity detection - returns 1 (True) or 0 (False)
            if predict([word['text']])[0]:
                profs.append(word)
    return profs


def mask_profanities(vocal_stem, profanities):
    """Load the vocal stem and attenuate each detected profanity over its start/end timestamps."""
    vocals = AudioSegment.from_wav(vocal_stem)
    for prof in profanities:
        mask = vocals[prof['start']*1000:prof['end']*1000]  # pydub works in milliseconds
        mask -= 50  # reduce level by 50 dB (enough to effectively mute it)
        #mask = mask.silent(len(mask))
        #mask = mask.fade_in(100).fade_out(100)  # prepends/appends fades, so the mask ends up longer
        start = vocals[:prof['start']*1000]
        end = vocals[prof['end']*1000:]
        #print(f"masking {prof['text']} from {prof['start']} to {prof['end']}")
        vocals = start + mask + end
    return vocals


if __name__ == "__main__":
    args = parse_args()
    if len(sys.argv) > 1:
        # command-line mode
        main(args, skip_ss=False)
    else:
        # no CLI arguments: run as a streamlit web app
        import streamlit as st
        st.title('Saylss')
        with st.expander("About", expanded=False):
            st.markdown('''
                This app processes an input audio track (.mp3 or .wav) with the purpose of
                identifying and muting profanities in the song.
                A larger model takes longer to run and is more accurate, and vice-versa.
                Simply select the model size and upload your file!
            ''')

        model = st.selectbox('Choose model size:', ('tiny', 'small', 'medium'), index=1)
        uploaded_file = st.file_uploader(
            "Choose input track:",
            type=[".mp3", ".wav"],
            accept_multiple_files=False,
        )
        if uploaded_file is not None:
            uploaded_file.name = uploaded_file.name.replace(' ', '_')
            ext = os.path.splitext(uploaded_file.name)[1]
            if ext == '.wav':
                st_format = 'audio/wav'
            elif ext == '.mp3':
                st_format = 'audio/mp3'
            uploaded_file_content = uploaded_file.getvalue()
            with open(uploaded_file.name, 'wb') as f:
                f.write(uploaded_file_content)
            audio_bytes_input = uploaded_file_content
            st.audio(audio_bytes_input, format=st_format)

            # run the pipeline on the uploaded file
            with st.spinner('Processing input audio...'):
                inpath = os.path.abspath(uploaded_file.name)
                outpath, vocal_stem, instr_stem = main(args, input_file=inpath, model_size=model)
            if outpath == 'No profanities found':
                st.text(outpath + ' - Refresh the page and try a different song or model size')
                sys.exit()

            # display output audio
            #st.text('Play output Track:')
            st.text('\nOutput:')
            audio_file = open(outpath, 'rb')
            audio_bytes = audio_file.read()
            st.audio(audio_bytes, format=st_format)

            # flush all media
            if os.path.isfile(inpath):
                os.remove(inpath)
            if os.path.isfile(outpath):
                os.remove(outpath)
            if os.path.isfile(vocal_stem):
                os.remove(vocal_stem)
            if os.path.isfile(instr_stem):
                os.remove(instr_stem)
            sep_dir = os.path.split(instr_stem)[0]
            if os.path.isdir(sep_dir):
                os.rmdir(sep_dir)
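
# Example invocations (a sketch only; the filename saylss.py is an assumption, and it presumes
# spleeter, whisper_timestamped, profanity-check, pydub, ffmpeg and streamlit are installed):
#
#   python saylss.py -i song.mp3 -m small -p    # CLI mode: separate, transcribe, mask, re-mix, then play
#   streamlit run saylss.py                     # no CLI args: launches the local web app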