Scrya committed on
Commit
17c5099
1 Parent(s): 6a6cc36

first commit

Browse files
.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ .ipynb_checkpoints/
2
+ flagged/
3
+ __pycache__/
app.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+
3
+ import gradio as gr
4
+ from config import BaseConfig
5
+ from predict import inputs, outputs, predict
6
+
7
if __name__ == "__main__":
    # Timestamped log format; the log level is left at the logging default.
    logging.basicConfig(format="[%(asctime)s] %(levelname)s: %(message)s")
    config = BaseConfig()

    # Wrap the text-to-audio `predict` function in a simple Gradio interface.
    app = gr.Interface(
        predict,
        inputs=inputs,
        outputs=outputs,
        title="Text-to-Meow",
        description="Ever thought of whether your cat understands your words? It no longer matters! Now you get to speak in their language!",
    )

    # `launch(enable_queue=...)` is deprecated in Gradio 3.x; request queuing
    # is enabled through `.queue()` instead.
    app.queue()

    app.launch(
        server_name="0.0.0.0",  # listen on all network interfaces
        server_port=config.port,
        share=True,
    )
config.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydantic import BaseSettings, Field
2
+
3
class BaseConfig(BaseSettings):
    """Settings model for the app; add new configuration fields here.

    Documentation for pydantic settings:
    https://pydantic-docs.helpmanual.io/usage/settings/
    """

    # KNative assigns a $PORT environment variable to the container
    port: int = Field(
        default=8080,
        env="PORT",
        description="Gradio App Server Port",
    )

    # Manifest listing the meow clips; opened by predict.load_meows.
    manifest_path: str = "meows/manifest.json"
    # Sample rate that predict.predict returns alongside the generated audio.
    sample_rate: int = 16000

    # Coefficients passed to utils.meow_stretch, which computes
    #   factor = init_factor + (add_factor * word_length) ** power_factor
    init_factor: float = 0.3
    add_factor: float = 0.2
    power_factor: float = 0.8


# Module-level settings instance imported by other modules.
config = BaseConfig()
meows/data/meow1.wav ADDED
Binary file (43.6 kB). View file
 
meows/data/meow10.wav ADDED
Binary file (21.4 kB). View file
 
meows/data/meow2.wav ADDED
Binary file (54.5 kB). View file
 
meows/data/meow3.wav ADDED
Binary file (36.4 kB). View file
 
meows/data/meow4.wav ADDED
Binary file (46.4 kB). View file
 
meows/data/meow5.wav ADDED
Binary file (32.3 kB). View file
 
meows/data/meow6.wav ADDED
Binary file (47.5 kB). View file
 
meows/data/meow7.wav ADDED
Binary file (27.7 kB). View file
 
meows/data/meow8.wav ADDED
Binary file (51.6 kB). View file
 
meows/data/meow9.wav ADDED
Binary file (18.1 kB). View file
 
meows/manifest.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {"audio_filepath": "data/meow1.wav", "text": "meow", "weight": 1.0}
2
+ {"audio_filepath": "data/meow2.wav", "text": "meow", "weight": 1.0}
3
+ {"audio_filepath": "data/meow3.wav", "text": "meow", "weight": 1.0}
4
+ {"audio_filepath": "data/meow4.wav", "text": "meow", "weight": 1.0}
5
+ {"audio_filepath": "data/meow5.wav", "text": "meow", "weight": 1.0}
6
+ {"audio_filepath": "data/meow6.wav", "text": "meow", "weight": 1.0}
7
+ {"audio_filepath": "data/meow7.wav", "text": "meow", "weight": 1.0}
8
+ {"audio_filepath": "data/meow8.wav", "text": "meow", "weight": 1.0}
9
+ {"audio_filepath": "data/meow9.wav", "text": "meow", "weight": 1.0}
10
+ {"audio_filepath": "data/meow10.wav", "text": "meow with bold", "weight": 0.01}
predict.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import random
4
+ import librosa
5
+ import numpy as np
6
+ import gradio as gr
7
+ from typing import Any, List, Dict, Tuple
8
+
9
+ from utils import meow_stretch, get_word_lengths
10
+ from config import config, BaseConfig
11
+
12
# Gradio input/output configuration: plain text in, an audio component out.
inputs: str = "text"
outputs: gr.Audio = gr.Audio()
15
+
16
def load_meows(cfg: BaseConfig) -> List[Dict[str, Any]]:
    '''
    Load every meow clip listed in the manifest.

    The manifest is read as JSON Lines: one JSON object per line, each with an
    ``audio_filepath`` entry that is resolved relative to the directory
    containing the manifest.

    :param cfg: settings providing ``manifest_path`` and ``sample_rate``
    :return: the manifest entries, each extended with ``audio`` (the decoded
        samples) and ``rate`` (the sample rate they were loaded at)
    '''
    meow_dir = os.path.dirname(cfg.manifest_path)

    items = []
    with open(cfg.manifest_path, mode='r', encoding='utf-8') as fr:
        for line in fr:
            # Tolerate blank lines, which json.loads would reject.
            if not line.strip():
                continue

            item = json.loads(line)
            # Resample to the configured rate. predict() reports
            # config.sample_rate for the generated audio, so clips kept at
            # their native rate (sr=None) would play back at the wrong
            # speed/pitch whenever a file's rate differs from it.
            item['audio'], item['rate'] = librosa.load(
                os.path.join(meow_dir, item['audio_filepath']),
                sr=cfg.sample_rate,
            )
            items.append(item)

    return items
30
+
31
def extract_meows_weights(items: List[Dict[str, Any]]) -> Tuple[List[np.ndarray], List[float]]:
    '''
    Split manifest entries into two parallel lists.

    :param items: entries carrying at least the ``audio`` and ``weight`` keys
    :return: ``(audio_clips, sampling_weights)`` in the same order as ``items``
    '''
    audio_clips: List[np.ndarray] = []
    sampling_weights: List[float] = []

    for entry in items:
        audio_clips.append(entry['audio'])
        sampling_weights.append(entry['weight'])

    return audio_clips, sampling_weights
35
+
36
# Load the meow clips once at import time; predict() samples from these lists.
meow_items = load_meows(config)
meows, weights = extract_meows_weights(meow_items)
39
+
40
def predict(text: str) -> Tuple[int, np.ndarray]:
    '''
    Translate text into audio made of one meow per word.

    For every word a clip is drawn at random (using the manifest weights),
    stretched according to the word's length, and the results are joined
    end to end.

    :param text: the sentence to translate
    :return: ``(sample_rate, samples)`` tuple for the Gradio audio output
    :raises gr.Error: if ``text`` contains no words
    '''
    word_lengths = get_word_lengths(text)
    if not word_lengths:
        # np.concatenate raises an opaque ValueError on an empty list;
        # report the actual problem to the user instead.
        raise gr.Error("Please enter at least one word to translate.")

    selected_meows = random.choices(meows, weights=weights, k=len(word_lengths))
    transformed_meows = [
        meow_stretch(
            meow, wl,
            init_factor=config.init_factor,
            add_factor=config.add_factor,
            power_factor=config.power_factor,
        )
        for meow, wl in zip(selected_meows, word_lengths)
    ]

    result_meows = np.concatenate(transformed_meows, axis=0)

    return config.sample_rate, result_meows
56
+
57
+
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ gradio==3.24.1
2
+ librosa==0.10.0.post2
3
+ numpy==1.23.5
utils.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import string
2
+ import librosa
3
+ import numpy as np
4
+
5
+ from typing import List
6
+
7
def stretch(x: np.ndarray, factor: float, nfft: int = 2048) -> np.ndarray:
    '''
    @author: Gagandeep Singh, 29 Oct, 2018
    https://github.com/gaganbahga/time_stretch

    Change the duration of an audio sequence in the frequency domain, using an
    STFT of size nfft and re-synthesising frames with accumulated phase.

    :param x: np.ndarray, audio array in PCM float32 format
    :param factor: float, step (in STFT frames) between the positions at which
        output frames are computed; must be > 0. A value > 1 yields fewer
        frames (shorter audio), a value < 1 yields more frames (longer audio).
    :param nfft: int, FFT size; the frame shift is nfft // 4
    :return: np.ndarray, time stretched audio
    :raises ValueError: if factor is not positive
    '''
    if factor <= 0:
        raise ValueError(f"factor must be positive, got {factor}")

    # Integer frame shift, passed explicitly so the phase advance below always
    # matches the hop actually used by the transforms.
    hop = nfft // 4

    # Time-major layout: one row per STFT frame, one column per frequency bin.
    stft = librosa.stft(x, n_fft=nfft, hop_length=hop).transpose()
    stft_cols = stft.shape[1]

    times = np.arange(0, stft.shape[0], factor)  # times at which new FFT to be calculated
    # np.complex128 is what the alias np.complex_ (removed in NumPy 2.0) meant.
    stft_new = np.zeros((len(times), stft_cols), dtype=np.complex128)
    phase_adv = (2 * np.pi * hop * np.arange(0, stft_cols)) / nfft
    phase = np.angle(stft[0])

    # Pad one zero frame so `left_frame + 1` is valid for the last frame.
    stft = np.concatenate((stft, np.zeros((1, stft_cols))), axis=0)

    for i, time in enumerate(times):
        left_frame = int(np.floor(time))
        local_frames = stft[[left_frame, left_frame + 1], :]
        right_wt = time - np.floor(time)  # weight on right frame out of 2
        # Magnitude: linear interpolation between the two neighbouring frames.
        local_mag = (1 - right_wt) * np.absolute(local_frames[0, :]) + right_wt * np.absolute(local_frames[1, :])
        # Phase difference between the frames minus the expected advance,
        # wrapped into [0, 2*pi).
        local_dphi = np.angle(local_frames[1, :]) - np.angle(local_frames[0, :]) - phase_adv
        local_dphi = local_dphi - 2 * np.pi * np.floor(local_dphi / (2 * np.pi))
        stft_new[i, :] = local_mag * np.exp(phase * 1j)
        # Accumulate phase for the next output frame (order matters: the
        # current frame must be written before the phase is advanced).
        phase += local_dphi + phase_adv

    return librosa.istft(stft_new.transpose(), hop_length=hop, n_fft=nfft)
40
+
41
def meow_stretch(
    x: np.ndarray, character_len: int,
    init_factor: float = 0.3, add_factor: float = 0.2,
    power_factor: float = 0.8, nfft: int = 2048
) -> np.ndarray:
    '''
    Stretch a meow in proportion to a word's length.

    The lengthening factor is
    ``init_factor + (add_factor * character_len) ** power_factor``; the
    exponent dampens growth so long words do not yield incredibly long meows.

    :param x: np.ndarray, the meow audio samples
    :param character_len: int, number of characters in the word
    :param nfft: int, FFT size forwarded to ``stretch``
    :return: np.ndarray, the stretched meow
    '''
    length_term = (add_factor * character_len) ** power_factor
    lengthening = init_factor + length_term

    # `stretch` receives the reciprocal of the lengthening factor.
    return stretch(x, 1 / lengthening, nfft=nfft)
52
+
53
def get_word_lengths(text_input: str) -> List[int]:
    '''
    Return the character count of each whitespace-separated word.

    ASCII punctuation (``string.punctuation``) is deleted before splitting, so
    it neither counts towards a word's length nor forms a word of its own.

    :param text_input: str, the raw text
    :return: list of word lengths, in order of appearance
    '''
    # Mapping a code point to None makes str.translate delete that character.
    drop_punctuation = dict.fromkeys(map(ord, string.punctuation))
    cleaned = text_input.translate(drop_punctuation)

    return list(map(len, cleaned.split()))