File size: 8,435 Bytes
3888ab7
 
 
 
 
 
2278710
3888ab7
 
 
 
 
 
 
 
 
fc2b6d6
3888ab7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fc2b6d6
3888ab7
 
0ef2079
fc2b6d6
 
30f9405
819a468
fc2b6d6
8c8ea80
3888ab7
8c8ea80
 
3888ab7
a9ed318
 
 
 
8c8ea80
30f9405
2171b56
 
8c8ea80
3888ab7
1801257
 
3888ab7
 
 
 
 
 
8c8ea80
3888ab7
8c8ea80
 
3888ab7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ad3c084
3888ab7
 
 
 
ad3c084
3888ab7
 
 
 
 
 
 
 
eec4853
3888ab7
eec4853
1aa6fb6
eec4853
 
3888ab7
 
 
1aa6fb6
3888ab7
e469266
 
 
 
 
 
3888ab7
 
 
1aa6fb6
 
 
 
3888ab7
1aa6fb6
3888ab7
 
1aa6fb6
3888ab7
1aa6fb6
3888ab7
 
56ab42f
 
 
3888ab7
 
 
 
56ab42f
3888ab7
56ab42f
 
3888ab7
28aa919
3888ab7
 
 
8c8ea80
3888ab7
 
1aa6fb6
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
import argparse
import glob
import os.path

import gradio as gr

import pickle
import tqdm
import json

import MIDI
from midi_synthesizer import synthesis

in_space = os.getenv("SYSTEM") == "spaces"


def find_midi():
    """Generator that samples token sequences from a two-stage model, one event per step.

    NOTE(review): this function is not callable as written and nothing in the
    visible file calls it.
      * ``disable_channels`` and ``prompt`` are assigned inside the body, so
        Python treats them as locals; reading them before assignment raises
        ``UnboundLocalError`` as soon as the generator is first advanced.
      * ``tokenizer``, ``model``, ``np``, ``max_len``, ``temp``, ``top_p``,
        ``top_k``, ``softmax``, ``sample_top_p_k``, ``disable_patch_change``
        and ``disable_control_change`` are neither parameters nor defined or
        imported in the visible part of the file.
    It looks like the body of a parameterised generation routine whose
    signature was dropped — TODO confirm and either restore the parameters or
    delete the function.

    Yields:
        The newly sampled token sequence for each step, flattened with
        ``reshape(-1)`` after padding to ``tokenizer.max_token_seq``.
    """
    # Translate channel numbers into the tokenizer's ids for the "channel" parameter.
    if disable_channels is not None:
        disable_channels = [tokenizer.parameter_ids["channel"][c] for c in disable_channels]
    else:
        disable_channels = []
    max_token_seq = tokenizer.max_token_seq
    if prompt is None:
        # No prompt: start from a single all-padding row whose first token is BOS.
        input_tensor = np.full((1, max_token_seq), tokenizer.pad_id, dtype=np.int64)
        input_tensor[0, 0] = tokenizer.bos_id  # bos
    else:
        # Truncate / right-pad the prompt's last axis to exactly max_token_seq.
        prompt = prompt[:, :max_token_seq]
        if prompt.shape[-1] < max_token_seq:
            prompt = np.pad(prompt, ((0, 0), (0, max_token_seq - prompt.shape[-1])),
                            mode="constant", constant_values=tokenizer.pad_id)
        input_tensor = prompt
    # Add a leading axis; axis 1 is then used as the running sequence length.
    input_tensor = input_tensor[None, :, :]
    cur_len = input_tensor.shape[1]
    bar = tqdm.tqdm(desc="generating", total=max_len - cur_len, disable=in_space)
    with bar:
        while cur_len < max_len:
            end = False
            # First stage: run on everything generated so far, keep the last position's output.
            hidden = model[0].run(None, {'x': input_tensor})[0][:, -1]
            next_token_seq = np.empty((1, 0), dtype=np.int64)
            event_name = ""
            for i in range(max_token_seq):
                # mask marks which vocabulary ids may be sampled at this position.
                mask = np.zeros(tokenizer.vocab_size, dtype=np.int64)
                if i == 0:
                    # Position 0 must be an event-type token or EOS.
                    mask_ids = list(tokenizer.event_ids.values()) + [tokenizer.eos_id]
                    if disable_patch_change:
                        mask_ids.remove(tokenizer.event_ids["patch_change"])
                    if disable_control_change:
                        mask_ids.remove(tokenizer.event_ids["control_change"])
                    mask[mask_ids] = 1
                else:
                    # Later positions must be a valid value for the event's (i-1)-th parameter.
                    param_name = tokenizer.events[event_name][i - 1]
                    mask_ids = tokenizer.parameter_ids[param_name]
                    if param_name == "channel":
                        mask_ids = [i for i in mask_ids if i not in disable_channels]
                    mask[mask_ids] = 1
                # Second stage: score the next token given the tokens sampled so far and `hidden`.
                logits = model[1].run(None, {'x': next_token_seq, "hidden": hidden})[0][:, -1:]
                # Multiplying by the mask zeroes the score of every disallowed id.
                scores = softmax(logits / temp, -1) * mask
                sample = sample_top_p_k(scores, top_p, top_k)
                if i == 0:
                    next_token_seq = sample
                    eid = sample.item()
                    if eid == tokenizer.eos_id:
                        end = True
                        break
                    event_name = tokenizer.id_events[eid]
                else:
                    next_token_seq = np.concatenate([next_token_seq, sample], axis=1)
                    # Stop once every parameter of this event type has been sampled.
                    if len(tokenizer.events[event_name]) == i:
                        break
            # Right-pad the sampled tokens so they can be concatenated onto input_tensor.
            if next_token_seq.shape[1] < max_token_seq:
                next_token_seq = np.pad(next_token_seq, ((0, 0), (0, max_token_seq - next_token_seq.shape[-1])),
                                        mode="constant", constant_values=tokenizer.pad_id)
            next_token_seq = next_token_seq[None, :, :]
            input_tensor = np.concatenate([input_tensor, next_token_seq], axis=1)
            cur_len += 1
            bar.update(1)
            yield next_token_seq.reshape(-1)
            # The EOS sequence is still yielded once before the loop stops.
            if end:
                break


def create_msg(name, data):
    """Build a message dict with a ``name`` key and a ``data`` key."""
    return dict(name=name, data=data)


def run(search_prompt, mid=None):
    """Stream a MIDI score to the visualizer, then render it to a file and audio.

    This is a generator used as a Gradio event handler. Every yielded value is
    a 4-tuple ``(mid_seq, midi_path, audio, js_messages)``; the file path and
    audio are ``None`` until the final yield.

    Args:
        search_prompt: Text typed by the user. Not used yet: no search is
            performed and the first meta-data entry is always shown.
        mid: Optional raw bytes of a MIDI file. When given, it is parsed with
            ``MIDI.midi2score`` and shown instead of the meta-data entry.

    Yields:
        First the full event list with messages that reset the visualizer and
        append every note, then one update per note event with a progress
        message, and finally the path ``"output.mid"``, ``(44100, audio)`` and
        a ``visualizer_end`` message.

    Raises:
        ValueError: If no MIDI was supplied and ``meta_data`` is empty.
    """
    if mid is None:
        # Only the first meta-data entry is ever displayed.
        first_entry = next(iter(meta_data), None)
        if first_entry is None:
            raise ValueError("meta-data is empty; there is no MIDI to display")
        # NOTE(review): assumes entry[1][16][1] is the ticks value and
        # entry[1][17:] the score events, inferred from how both are used as
        # [ticks, events] below — confirm against the meta-data format.
        mid_seq = list(first_entry[1][17:])
        mid_seq_ticks = first_entry[1][16][1]
    else:
        # midi2score returns [ticks, track, track, ...]. Merge all tracks into
        # one event list so it fits the single-track score built below; score
        # events carry absolute start times at index 1, so a stable sort keeps
        # them chronological.
        score = MIDI.midi2score(mid)
        mid_seq_ticks = score[0]
        mid_seq = sorted(
            (event for track in score[1:] for event in track),
            key=lambda event: event[1],
        )

    total_events = len(mid_seq)
    note_events = [(index, event) for index, event in enumerate(mid_seq) if event[0] == 'note']

    init_msgs = [create_msg("visualizer_clear", None)]
    init_msgs.extend(create_msg("visualizer_append", event) for _, event in note_events)
    yield mid_seq, None, None, init_msgs

    # NOTE(review): the initial message batch already appended every note, so
    # each note is sent to the visualizer a second time here — confirm whether
    # that is intended.
    for index, event in note_events:
        yield mid_seq, None, None, [create_msg("visualizer_append", event),
                                    create_msg("progress", [index + 1, total_events])]

    with open("output.mid", 'wb') as f:
        f.write(MIDI.score2midi([mid_seq_ticks, mid_seq]))
    audio = synthesis(MIDI.score2opus([mid_seq_ticks, mid_seq]), soundfont_path)
    yield mid_seq, "output.mid", (44100, audio), [create_msg("visualizer_end", None)]


def cancel_run(mid_seq):
    """Render whatever has been produced so far when the user presses stop.

    Args:
        mid_seq: The score accumulated by the running job, or ``None`` / an
            empty value when nothing has been produced yet.

    Returns:
        A 3-tuple ``(midi_path, audio, js_messages)``. All three are ``None``
        when there is nothing to render, so the result always has one value
        per output the handler is wired to.
    """
    # Also covers an empty list or empty string, which cannot be rendered.
    if not mid_seq:
        return None, None, None

    # NOTE(review): mid_seq is passed to score2midi / score2opus unchanged, so
    # it must already be a complete score ([ticks, track, ...]) rather than a
    # bare event list — confirm what the caller supplies.
    with open("output.mid", 'wb') as f:
        f.write(MIDI.score2midi(mid_seq))
    audio = synthesis(MIDI.score2opus(mid_seq), soundfont_path)
    return "output.mid", (44100, audio), [create_msg("visualizer_end", None)]


def load_javascript(dir="javascript"):
    """Inject every ``*.js`` file found in *dir* into the pages Gradio serves.

    Each file is read once, wrapped in a ``<script>`` tag preceded by an HTML
    comment naming its path, and the combined markup is spliced in front of
    ``</head>`` by replacing ``gr.routes.templates.TemplateResponse`` with a
    wrapper around the original.
    """
    snippets = []
    for js_path in glob.glob(f"{dir}/*.js"):
        with open(js_path, "r", encoding="utf8") as handle:
            snippets.append(f"\n<!-- {js_path} --><script>{handle.read()}</script>")
    javascript = "".join(snippets)
    # The markup never changes after this point, so encode it once.
    head_with_scripts = f'{javascript}</head>'.encode("utf8")
    original_template_response = gr.routes.templates.TemplateResponse

    def template_response(*args, **kwargs):
        response = original_template_response(*args, **kwargs)
        response.body = response.body.replace(b'</head>', head_with_scripts)
        # Rebuild the headers because the body length changed.
        response.init_headers()
        return response

    gr.routes.templates.TemplateResponse = template_response


class JSMsgReceiver(gr.HTML):
    """Hidden HTML component used to hand JSON messages to the page.

    The component is created invisible with the element id ``msg_receiver``.
    """

    def __init__(self, **kwargs):
        super().__init__(elem_id="msg_receiver", visible=False, **kwargs)

    def postprocess(self, y):
        """Wrap a truthy value as JSON inside ``<p>``; pass falsy values through unchanged."""
        payload = f"<p>{json.dumps(y)}</p>" if y else y
        return super().postprocess(payload)

    def get_block_name(self) -> str:
        """Report the block name ``"html"`` for this subclass."""
        return "html"

# Script entry point: parse CLI options, load the meta-data, build the Gradio UI and launch it.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--share", action="store_true", default=False, help="share gradio app")
    parser.add_argument("--port", type=int, default=7860, help="gradio server port")
    # NOTE(review): --max-gen is parsed but opt.max_gen is never read in the visible code.
    parser.add_argument("--max-gen", type=int, default=1024, help="max")
    
    opt = parser.parse_args()
    
    # Module-level names read by run() and cancel_run().
    soundfont_path = "SGM-v2.01-YamahaGrand-Guit-Bass-v2.7.sf2"
    meta_data_path = "meta-data/LAMD_META_10000.pickle"
    
    # NOTE(review): models_info is not referenced anywhere else in the visible code.
    models_info = {"generic pretrain model": ["skytnt/midi-model", ""],
                   "j-pop finetune model": ["skytnt/midi-model-ft", "jpop/"],
                   "touhou finetune model": ["skytnt/midi-model-ft", "touhou/"]}


    # SECURITY: pickle.load can execute arbitrary code from the file it reads;
    # only load a meta-data file that comes from a trusted source.
    print('Loading meta-data...')
    with open(meta_data_path, 'rb') as f:
        meta_data = pickle.load(f)
    print('Done!')
    
    
    # Patch Gradio's template response before the app is built so the
    # scripts from the javascript/ directory are included in served pages.
    load_javascript()
    app = gr.Blocks()
    with app:
        gr.Markdown("<h1 style='text-align: center; margin-bottom: 1rem'>MIDI Search</h1>")
        gr.Markdown("![Visitors](https://api.visitorbadge.io/api/visitors?path=asigalov61.MIDI-Search&style=flat)\n\n"
                    "MIDI Search and Explore\n\n"
                    "Demo for [MIDI Search](https://github.com/asigalov61)\n\n"
                    "[Open In Colab]"
                    "(https://colab.research.google.com/github/asigalov61/MIDI-Search/blob/main/demo.ipynb)"
                    " for faster running and longer generation"
                    )
        
        # Hidden component that receives the message lists produced by the handlers.
        js_msg = JSMsgReceiver()
        
        with gr.Tabs():
            with gr.TabItem("instrument prompt") as tab1:
                
                search_prompt = gr.Textbox(label="search prompt")
                
            with gr.TabItem("midi prompt") as tab2:
                # NOTE(review): input_midi is not passed to any handler below,
                # so an uploaded file is never used and run() always sees mid=None.
                input_midi = gr.File(label="input midi", file_types=[".midi", ".mid"], type="binary")

        with gr.Accordion("options", open=False):
 
            # NOTE(review): input_allow_cc is not passed to any handler either.
            input_allow_cc = gr.Checkbox(label="allow midi cc event", value=True)
            
        search_btn = gr.Button("search", variant="primary")
        stop_btn = gr.Button("stop and output")
        # Receives the first value yielded by run() and is the input of cancel_run().
        # NOTE(review): a Textbox carries text, so cancel_run presumably gets a
        # string rather than the original list — confirm this is intended.
        output_midi_seq = gr.Textbox()
        output_midi_visualizer = gr.HTML(elem_id="midi_visualizer_container")
        output_audio = gr.Audio(label="output audio", format="mp3", elem_id="midi_audio")
        output_midi = gr.File(label="output midi", file_types=[".mid"])
        # run() yields four values, one per output component listed here.
        run_event = search_btn.click(run, [search_prompt],
                                  [output_midi_seq, output_midi, output_audio, js_msg])
        # The stop button cancels the running search job and bypasses the queue.
        stop_btn.click(cancel_run, output_midi_seq, [output_midi, output_audio, js_msg], cancels=run_event, queue=False)
    # NOTE(review): the positional 1 passed to queue() is presumably the concurrency limit — confirm for the Gradio version in use.
    app.queue(1).launch(server_port=opt.port, share=opt.share, inbrowser=True)