Spaces:

stepfun-ai
/

Step-Audio

Running

App Files Files Community

martin committed 1 day ago

Commit

ccdff04

1 Parent(s): 51a6224

tts use api

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +0 -6
Dockerfile +0 -46
app.py +24 -32
cosyvoice/__init__.py +0 -0
cosyvoice/cli/__init__.py +0 -0
cosyvoice/cli/cosyvoice.py +0 -68
cosyvoice/cli/frontend.py +0 -106
cosyvoice/cli/model.py +0 -32
cosyvoice/flow/decoder.py +0 -238
cosyvoice/flow/flow.py +0 -196
cosyvoice/flow/flow_matching.py +0 -315
cosyvoice/flow/length_regulator.py +0 -65
cosyvoice/hifigan/f0_predictor.py +0 -55
cosyvoice/hifigan/generator.py +0 -566
cosyvoice/matcha/audio.py +0 -90
cosyvoice/matcha/decoder.py +0 -511
cosyvoice/matcha/flow_matching.py +0 -141
cosyvoice/matcha/transformer.py +0 -443
cosyvoice/transformer/__init__.py +0 -0
cosyvoice/transformer/activation.py +0 -87
cosyvoice/transformer/attention.py +0 -322
cosyvoice/transformer/convolution.py +0 -147
cosyvoice/transformer/decoder.py +0 -418
cosyvoice/transformer/decoder_layer.py +0 -132
cosyvoice/transformer/embedding.py +0 -293
cosyvoice/transformer/encoder.py +0 -633
cosyvoice/transformer/encoder_layer.py +0 -237
cosyvoice/transformer/label_smoothing_loss.py +0 -98
cosyvoice/transformer/positionwise_feed_forward.py +0 -116
cosyvoice/transformer/subsampling.py +0 -391
cosyvoice/utils/__init__.py +0 -0
cosyvoice/utils/audio.py +0 -90
cosyvoice/utils/class_utils.py +0 -78
cosyvoice/utils/common.py +0 -169
cosyvoice/utils/executor.py +0 -151
cosyvoice/utils/file_utils.py +0 -49
cosyvoice/utils/frontend_utils.py +0 -142
cosyvoice/utils/mask.py +0 -226
cosyvoice/utils/scheduler.py +0 -761
cosyvoice/utils/train_utils.py +0 -350
funasr_detach/__init__.py +0 -38
funasr_detach/auto/__init__.py +0 -0
funasr_detach/auto/auto_frontend.py +0 -90
funasr_detach/auto/auto_model.py +0 -573
funasr_detach/auto/auto_tokenizer.py +0 -7
funasr_detach/bin/__init__.py +0 -0
funasr_detach/bin/compute_audio_cmvn.py +0 -152
funasr_detach/bin/inference.py +0 -33
funasr_detach/bin/tokenize_text.py +0 -281
funasr_detach/bin/train.py +0 -227

.gitattributes CHANGED Viewed

@@ -2,11 +2,5 @@
 *.wav filter=lfs diff=lfs merge=lfs -text
 assets/user.png filter=lfs diff=lfs merge=lfs -text
 assets/assistant.png filter=lfs diff=lfs merge=lfs -text
-speakers/闫雨婷_prompt.wav filter=lfs diff=lfs merge=lfs -text
-speakers/闫雨婷RAP_prompt.wav filter=lfs diff=lfs merge=lfs -text
-speakers/闫雨婷VOCAL_prompt.wav filter=lfs diff=lfs merge=lfs -text
-speakers/Tingting_prompt.wav filter=lfs diff=lfs merge=lfs -text
-speakers/TingtingRAP_prompt.wav filter=lfs diff=lfs merge=lfs -text
-speakers/TingtingVOCAL_prompt.wav filter=lfs diff=lfs merge=lfs -text
 assets/yuewen.jpeg filter=lfs diff=lfs merge=lfs -text
 assets/request_rap_zh.wav filter=lfs diff=lfs merge=lfs -text

 *.wav filter=lfs diff=lfs merge=lfs -text
 assets/user.png filter=lfs diff=lfs merge=lfs -text
 assets/assistant.png filter=lfs diff=lfs merge=lfs -text
 assets/yuewen.jpeg filter=lfs diff=lfs merge=lfs -text
 assets/request_rap_zh.wav filter=lfs diff=lfs merge=lfs -text

Dockerfile DELETED Viewed

@@ -1,46 +0,0 @@
-FROM nvidia/cuda:12.1.0-base-ubuntu20.04
-ENV TZ=Asia/Shanghai
-RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime \
-    && echo $TZ > /etc/timezone
-RUN apt-get update \
-    && apt-get install -y build-essential \
-    && apt-get install -y wget \
-    && apt-get install -y software-properties-common curl zip unzip git-lfs awscli libssl-dev openssh-server vim \
-    && apt-get install -y net-tools iputils-ping iproute2
-RUN apt-get install --reinstall ca-certificates && update-ca-certificates
-RUN add-apt-repository -y 'ppa:deadsnakes/ppa' && apt update
-RUN apt install python3.10 python3.10-dev python3.10-distutils python3.10-venv -y \
-    && apt-get clean \
-    && rm -rf /var/lib/apt/lists/*
-RUN wget -qO- https://bootstrap.pypa.io/get-pip.py | python3.10
-RUN ln -s /usr/bin/python3.10 /usr/bin/python
-RUN pip uninstall -y Pillow && pip install pillow
-# https://huggingface.co/docs/hub/spaces-sdks-docker#permissions
-RUN useradd -m -u 1000 user
-USER user
-ENV HOME="/home/user" \
-    PATH="/home/user/.local/bin:${PATH}"
-RUN python3.10 -m pip install pipx
-RUN pipx install poetry
-RUN poetry --version || { echo 'Poetry installation check failed' ; exit 1; }
-WORKDIR /workspace
-COPY --chown=user requirements.txt .
-RUN pip install -r requirements.txt
-COPY --chown=user . .
-RUN pip install gradio
-RUN pip install openai
-RUN chmod +x start_app.sh
-CMD ["./start_app.sh", "/tmp/hf_model"]

app.py CHANGED Viewed

@@ -4,15 +4,13 @@ import gradio as gr
 import time
 from pathlib import Path
-from tokenizer import StepAudioTokenizer
-from tts import StepAudioTTS
-from yuewen_api import call_audiochat, call_asr
 CACHE_DIR = "/tmp/gradio/"
-CACHE_CLEAN_AGE = 864000
 CHINESE_PROMPT_CONTENT = """你是一个为对话而设计的人工智能模型，目前无法连接到互联网。
-当你需要唱歌或说唱时，请以（RAP）开头。当你需要快速说话时，请以（快速）开头。当你需要慢速说话时，请以（慢速）开头。
 现在，你需要倾听用户的语音内容，并以礼貌、简洁、口语化的文本进行回复。你需要尽量用户的语种进行回复。"""
 ENGLISH_PROMPT_CONTENT = """You are an AI designed for conversation, currently unable to connect to the internet.
@@ -89,20 +87,15 @@ def add_message(chatbot, history, mic, text):
     return chatbot, history, None
-def save_tmp_audio(audio, sr):
     import tempfile
-    import torchaudio
-    with tempfile.NamedTemporaryFile(
-        dir=CACHE_DIR, delete=False, suffix=".wav"
-    ) as temp_audio:
-        temp_audio_path = temp_audio.name
-        torchaudio.save(temp_audio_path, audio, sr)
     return temp_audio.name
-def predict(chatbot, history, tts_model, user_prompt, enable_asr):
     """Generate a response from the model."""
     start_time = time.time()
     try:
@@ -126,8 +119,8 @@ def predict(chatbot, history, tts_model, user_prompt, enable_asr):
         text = call_audiochat(messages)
         print(f"predict {text=}")
-        audio, sr = tts_model(text, "Tingting")
-        audio_path = save_tmp_audio(audio, sr)
         print(f"save_tmp_audio {audio_path=}")
         chatbot.append({"role": "assistant", "content": text})
         chatbot.append({"role": "assistant", "content": {"path": audio_path}})
@@ -142,17 +135,15 @@ def predict(chatbot, history, tts_model, user_prompt, enable_asr):
     return chatbot, history
-def _launch_demo(args, tts_model):
-    with gr.Blocks(delete_cache=(86400, CACHE_CLEAN_AGE)) as demo:
         # 保存 chat 历史，不需要每次再重新拼格式
         history = gr.State([])
         gr.Markdown("""<center><font size=8>Step Audio Chat</center>""")
         gr.Markdown(
             """<font size=4>This preview demonstrates core functionalities. To unlock the cormplete real-time voice conversation system with end-to-end encryption and advanced features, download the [Yuewen APP](https://m.yuewen.cn/call-app) with the link or via QR Code.</font>"""
         )
-        with gr.Accordion(
-            label="Click to view the QR code ", open=False
-        ):
             gr.Image(
                 value="assets/yuewen.jpeg",
                 interactive=False,
@@ -161,7 +152,8 @@ def _launch_demo(args, tts_model):
                 show_fullscreen_button=False,
             )
         with gr.Accordion(
-            label="The performance of English prompts is not as stable as that of Chinese prompts. You can click here to change sys prompt.", open=False
         ):
             prompt_choice = gr.Radio(
                 choices=list(PROMPT_TEMPLATE.keys()),
@@ -222,7 +214,7 @@ def _launch_demo(args, tts_model):
                 print(f"update_examples error")
                 return chatbot, history
             else:
-                chatbot, history = predict(chatbot, history, tts_model, user_prompt, enable_asr)
                 print(f"update_examples done")
                 return chatbot, history
@@ -230,7 +222,13 @@ def _launch_demo(args, tts_model):
             gr.Examples(
                 fn=update_examples,
                 examples=CHAT_EXAMPLES,
-                inputs=[example_comment, example_text, example_audio, user_prompt, enable_asr],
                 outputs=[chatbot, history],
                 run_on_click=True,
             )
@@ -241,7 +239,7 @@ def _launch_demo(args, tts_model):
                 gr.Warning(error)
                 return chatbot, history, None, None
             else:
-                chatbot, history = predict(chatbot, history, tts_model, user_prompt, enable_asr)
                 return chatbot, history, None, None
         gen_btn.click(
@@ -266,7 +264,7 @@ def _launch_demo(args, tts_model):
             while history and history[-1]["role"] == "assistant":
                 print(f"discard {history[-1]}")
                 history.pop()
-            return predict(chatbot, history, tts_model, user_prompt, enable_asr)
         regen_btn.click(
             regenerate,
@@ -295,10 +293,4 @@ if __name__ == "__main__":
         "--server-name", type=str, default="0.0.0.0", help="Demo server name."
     )
     args = parser.parse_args()
-    tokenizer = StepAudioTokenizer(
-        os.path.join(args.model_path, "Step-Audio-Tokenizer")
-    )
-    tts_model = StepAudioTTS(
-        os.path.join(args.model_path, "Step-Audio-TTS-3B"), tokenizer
-    )
-    _launch_demo(args, tts_model)

 import time
 from pathlib import Path
+from yuewen_api import call_audiochat, call_asr, call_tts
 CACHE_DIR = "/tmp/gradio/"
+CACHE_CLEAN_AGE = 86400
 CHINESE_PROMPT_CONTENT = """你是一个为对话而设计的人工智能模型，目前无法连接到互联网。
+当你需要唱歌时，请以（哼唱）开头。当你需要rap或说唱时，请以（RAP）开头。当你需要快速说话时，请以（快速）开头。当你需要慢速说话时，请以（慢速）开头。
 现在，你需要倾听用户的语音内容，并以礼貌、简洁、口语化的文本进行回复。你需要尽量用户的语种进行回复。"""
 ENGLISH_PROMPT_CONTENT = """You are an AI designed for conversation, currently unable to connect to the internet.
     return chatbot, history, None
+def get_tmp_audio_path():
     import tempfile
+    temp_audio = tempfile.NamedTemporaryFile(dir=CACHE_DIR, delete=False, suffix=".mp3")
     return temp_audio.name
+def predict(chatbot, history, user_prompt, enable_asr):
     """Generate a response from the model."""
     start_time = time.time()
     try:
         text = call_audiochat(messages)
         print(f"predict {text=}")
+        audio_path = get_tmp_audio_path()
+        call_tts(text, audio_path)
         print(f"save_tmp_audio {audio_path=}")
         chatbot.append({"role": "assistant", "content": text})
         chatbot.append({"role": "assistant", "content": {"path": audio_path}})
     return chatbot, history
+def _launch_demo(args):
+    with gr.Blocks(delete_cache=(3600, CACHE_CLEAN_AGE)) as demo:
         # 保存 chat 历史，不需要每次再重新拼格式
         history = gr.State([])
         gr.Markdown("""<center><font size=8>Step Audio Chat</center>""")
         gr.Markdown(
             """<font size=4>This preview demonstrates core functionalities. To unlock the cormplete real-time voice conversation system with end-to-end encryption and advanced features, download the [Yuewen APP](https://m.yuewen.cn/call-app) with the link or via QR Code.</font>"""
         )
+        with gr.Accordion(label="Click to view the QR code ", open=False):
             gr.Image(
                 value="assets/yuewen.jpeg",
                 interactive=False,
                 show_fullscreen_button=False,
             )
         with gr.Accordion(
+            label="The performance of English prompts is not as stable as that of Chinese prompts. You can click here to change sys prompt.",
+            open=False,
         ):
             prompt_choice = gr.Radio(
                 choices=list(PROMPT_TEMPLATE.keys()),
                 print(f"update_examples error")
                 return chatbot, history
             else:
+                chatbot, history = predict(chatbot, history, user_prompt, enable_asr)
                 print(f"update_examples done")
                 return chatbot, history
             gr.Examples(
                 fn=update_examples,
                 examples=CHAT_EXAMPLES,
+                inputs=[
+                    example_comment,
+                    example_text,
+                    example_audio,
+                    user_prompt,
+                    enable_asr,
+                ],
                 outputs=[chatbot, history],
                 run_on_click=True,
             )
                 gr.Warning(error)
                 return chatbot, history, None, None
             else:
+                chatbot, history = predict(chatbot, history, user_prompt, enable_asr)
                 return chatbot, history, None, None
         gen_btn.click(
             while history and history[-1]["role"] == "assistant":
                 print(f"discard {history[-1]}")
                 history.pop()
+            return predict(chatbot, history, user_prompt, enable_asr)
         regen_btn.click(
             regenerate,
         "--server-name", type=str, default="0.0.0.0", help="Demo server name."
     )
     args = parser.parse_args()
+    _launch_demo(args)

cosyvoice/__init__.py DELETED Viewed

File without changes

cosyvoice/cli/__init__.py DELETED Viewed

File without changes

cosyvoice/cli/cosyvoice.py DELETED Viewed

@@ -1,68 +0,0 @@
-# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import os
-import uuid
-import time
-from tqdm import tqdm
-import torch
-import torchaudio
-from hyperpyyaml import load_hyperpyyaml
-from cosyvoice.cli.frontend import CosyVoiceFrontEnd
-from cosyvoice.cli.model import CosyVoiceModel
-class CosyVoice:
-    def __init__(
-        self,
-        model_dir,
-    ):
-        self.model_dir = model_dir
-        with open("{}/cosyvoice.yaml".format(model_dir), "r") as f:
-            configs = load_hyperpyyaml(f)
-        self.frontend = CosyVoiceFrontEnd(
-            configs["feat_extractor"],
-            "{}/campplus.onnx".format(model_dir),
-            "{}/speech_tokenizer_v1.onnx".format(model_dir),
-        )
-        self.model = CosyVoiceModel(configs["flow"], configs["hift"])
-        self.model.load(
-            "{}/flow.pt".format(model_dir),
-            "{}/hift.pt".format(model_dir),
-        )
-        self.model.flow = self.model.flow.to(torch.bfloat16)
-        del configs
-    def token_to_wav_offline(
-        self,
-        speech_token,
-        speech_feat,
-        speech_feat_len,
-        prompt_token,
-        prompt_token_len,
-        embedding,
-    ):
-        tts_mel = self.model.flow.inference(
-            token=speech_token.to(self.model.device),
-            token_len=torch.tensor([speech_token.size(1)], dtype=torch.int32).to(
-                self.model.device
-            ),
-            prompt_token=prompt_token.to(self.model.device),
-            prompt_token_len=prompt_token_len.to(self.model.device),
-            prompt_feat=speech_feat.to(self.model.device),
-            prompt_feat_len=speech_feat_len.to(self.model.device),
-            embedding=embedding.to(self.model.device),
-        )
-        tts_speech = self.model.hift.inference(mel=tts_mel.float())[0].cpu()
-        return tts_speech

cosyvoice/cli/frontend.py DELETED Viewed

@@ -1,106 +0,0 @@
-# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import onnxruntime
-import torch
-import numpy as np
-import whisper
-from typing import Callable
-import torchaudio.compliance.kaldi as kaldi
-class CosyVoiceFrontEnd:
-    def __init__(
-        self,
-        feat_extractor: Callable,
-        campplus_model: str,
-        speech_tokenizer_model: str,
-    ):
-        self.feat_extractor = feat_extractor
-        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        option = onnxruntime.SessionOptions()
-        option.graph_optimization_level = (
-            onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
-        )
-        option.intra_op_num_threads = 1
-        self.campplus_session = onnxruntime.InferenceSession(
-            campplus_model, sess_options=option, providers=["CPUExecutionProvider"]
-        )
-        self.speech_tokenizer_session = onnxruntime.InferenceSession(
-            speech_tokenizer_model,
-            sess_options=option,
-            providers=[
-                (
-                    "CUDAExecutionProvider"
-                    if torch.cuda.is_available()
-                    else "CPUExecutionProvider"
-                )
-            ],
-        )
-    def _extract_speech_token(self, speech):
-        assert (
-            speech.shape[1] / 16000 <= 30
-        ), "do not support extract speech token for audio longer than 30s"
-        feat = whisper.log_mel_spectrogram(speech, n_mels=128)
-        speech_token = (
-            self.speech_tokenizer_session.run(
-                None,
-                {
-                    self.speech_tokenizer_session.get_inputs()[0]
-                    .name: feat.detach()
-                    .cpu()
-                    .numpy(),
-                    self.speech_tokenizer_session.get_inputs()[1].name: np.array(
-                        [feat.shape[2]], dtype=np.int32
-                    ),
-                },
-            )[0]
-            .flatten()
-            .tolist()
-        )
-        speech_token = torch.tensor([speech_token], dtype=torch.int32).to(self.device)
-        speech_token_len = torch.tensor([speech_token.shape[1]], dtype=torch.int32).to(
-            self.device
-        )
-        return speech_token, speech_token_len
-    def _extract_spk_embedding(self, speech):
-        feat = kaldi.fbank(speech, num_mel_bins=80, dither=0, sample_frequency=16000)
-        feat = feat - feat.mean(dim=0, keepdim=True)
-        embedding = (
-            self.campplus_session.run(
-                None,
-                {
-                    self.campplus_session.get_inputs()[0]
-                    .name: feat.unsqueeze(dim=0)
-                    .cpu()
-                    .numpy()
-                },
-            )[0]
-            .flatten()
-            .tolist()
-        )
-        embedding = torch.tensor([embedding]).to(self.device)
-        return embedding
-    def _extract_speech_feat(self, speech):
-        speech_feat = (
-            self.feat_extractor(speech).squeeze(dim=0).transpose(0, 1).to(self.device)
-        )
-        speech_feat = speech_feat.unsqueeze(dim=0)
-        speech_feat_len = torch.tensor([speech_feat.shape[1]], dtype=torch.int32).to(
-            self.device
-        )
-        return speech_feat, speech_feat_len

cosyvoice/cli/model.py DELETED Viewed

@@ -1,32 +0,0 @@
-# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-class CosyVoiceModel:
-    def __init__(
-        self,
-        flow: torch.nn.Module,
-        hift: torch.nn.Module,
-    ):
-        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        self.flow = flow
-        self.hift = hift
-    def load(self, flow_model, hift_model):
-        self.flow.load_state_dict(torch.load(flow_model, map_location=self.device))
-        self.flow.to(self.device).eval()
-        self.hift.load_state_dict(torch.load(hift_model, map_location=self.device))
-        self.hift.to(self.device).eval()

cosyvoice/flow/decoder.py DELETED Viewed

@@ -1,238 +0,0 @@
-# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-import torch.nn as nn
-from einops import pack, rearrange, repeat
-from cosyvoice.matcha.decoder import (
-    SinusoidalPosEmb,
-    Block1D,
-    ResnetBlock1D,
-    Downsample1D,
-    TimestepEmbedding,
-    Upsample1D,
-)
-from cosyvoice.matcha.transformer import BasicTransformerBlock
-class ConditionalDecoder(nn.Module):
-    def __init__(
-        self,
-        in_channels,
-        out_channels,
-        channels=(256, 256),
-        dropout=0.05,
-        attention_head_dim=64,
-        n_blocks=1,
-        num_mid_blocks=2,
-        num_heads=4,
-        act_fn="snake",
-    ):
-        """
-        This decoder requires an input with the same shape of the target. So, if your text content
-        is shorter or longer than the outputs, please re-sampling it before feeding to the decoder.
-        """
-        super().__init__()
-        channels = tuple(channels)
-        self.in_channels = in_channels
-        self.out_channels = out_channels
-        self.time_embeddings = SinusoidalPosEmb(in_channels)
-        time_embed_dim = channels[0] * 4
-        self.time_mlp = TimestepEmbedding(
-            in_channels=in_channels,
-            time_embed_dim=time_embed_dim,
-            act_fn="silu",
-        )
-        self.down_blocks = nn.ModuleList([])
-        self.mid_blocks = nn.ModuleList([])
-        self.up_blocks = nn.ModuleList([])
-        output_channel = in_channels
-        for i in range(len(channels)):  # pylint: disable=consider-using-enumerate
-            input_channel = output_channel
-            output_channel = channels[i]
-            is_last = i == len(channels) - 1
-            resnet = ResnetBlock1D(
-                dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim
-            )
-            transformer_blocks = nn.ModuleList(
-                [
-                    BasicTransformerBlock(
-                        dim=output_channel,
-                        num_attention_heads=num_heads,
-                        attention_head_dim=attention_head_dim,
-                        dropout=dropout,
-                        activation_fn=act_fn,
-                    )
-                    for _ in range(n_blocks)
-                ]
-            )
-            downsample = (
-                Downsample1D(output_channel)
-                if not is_last
-                else nn.Conv1d(output_channel, output_channel, 3, padding=1)
-            )
-            self.down_blocks.append(
-                nn.ModuleList([resnet, transformer_blocks, downsample])
-            )
-        for _ in range(num_mid_blocks):
-            input_channel = channels[-1]
-            out_channels = channels[-1]
-            resnet = ResnetBlock1D(
-                dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim
-            )
-            transformer_blocks = nn.ModuleList(
-                [
-                    BasicTransformerBlock(
-                        dim=output_channel,
-                        num_attention_heads=num_heads,
-                        attention_head_dim=attention_head_dim,
-                        dropout=dropout,
-                        activation_fn=act_fn,
-                    )
-                    for _ in range(n_blocks)
-                ]
-            )
-            self.mid_blocks.append(nn.ModuleList([resnet, transformer_blocks]))
-        channels = channels[::-1] + (channels[0],)
-        for i in range(len(channels) - 1):
-            input_channel = channels[i] * 2
-            output_channel = channels[i + 1]
-            is_last = i == len(channels) - 2
-            resnet = ResnetBlock1D(
-                dim=input_channel,
-                dim_out=output_channel,
-                time_emb_dim=time_embed_dim,
-            )
-            transformer_blocks = nn.ModuleList(
-                [
-                    BasicTransformerBlock(
-                        dim=output_channel,
-                        num_attention_heads=num_heads,
-                        attention_head_dim=attention_head_dim,
-                        dropout=dropout,
-                        activation_fn=act_fn,
-                    )
-                    for _ in range(n_blocks)
-                ]
-            )
-            upsample = (
-                Upsample1D(output_channel, use_conv_transpose=True)
-                if not is_last
-                else nn.Conv1d(output_channel, output_channel, 3, padding=1)
-            )
-            self.up_blocks.append(nn.ModuleList([resnet, transformer_blocks, upsample]))
-        self.final_block = Block1D(channels[-1], channels[-1])
-        self.final_proj = nn.Conv1d(channels[-1], self.out_channels, 1)
-        self.initialize_weights()
-    def initialize_weights(self):
-        for m in self.modules():
-            if isinstance(m, nn.Conv1d):
-                nn.init.kaiming_normal_(m.weight, nonlinearity="relu")
-                if m.bias is not None:
-                    nn.init.constant_(m.bias, 0)
-            elif isinstance(m, nn.GroupNorm):
-                nn.init.constant_(m.weight, 1)
-                nn.init.constant_(m.bias, 0)
-            elif isinstance(m, nn.Linear):
-                nn.init.kaiming_normal_(m.weight, nonlinearity="relu")
-                if m.bias is not None:
-                    nn.init.constant_(m.bias, 0)
-    def forward(self, x, mask, mu, t, spks=None, cond=None):
-        """Forward pass of the UNet1DConditional model.
-        Args:
-            x (torch.Tensor): shape (batch_size, in_channels, time)
-            mask (_type_): shape (batch_size, 1, time)
-            t (_type_): shape (batch_size)
-            spks (_type_, optional): shape: (batch_size, condition_channels). Defaults to None.
-            cond (_type_, optional): placeholder for future use. Defaults to None.
-        Raises:
-            ValueError: _description_
-            ValueError: _description_
-        Returns:
-            _type_: _description_
-        """
-        t = self.time_embeddings(t).to(t.dtype)
-        t = self.time_mlp(t)
-        x = pack([x, mu], "b * t")[0]
-        if spks is not None:
-            spks = repeat(spks, "b c -> b c t", t=x.shape[-1])
-            x = pack([x, spks], "b * t")[0]
-        if cond is not None:
-            x = pack([x, cond], "b * t")[0]
-        hiddens = []
-        masks = [mask]
-        for resnet, transformer_blocks, downsample in self.down_blocks:
-            mask_down = masks[-1]
-            x = resnet(
-                x.to(torch.bfloat16), mask_down.to(torch.bfloat16), t.to(torch.bfloat16)
-            )
-            x = rearrange(x, "b c t -> b t c").contiguous()
-            # attn_mask = torch.matmul(mask_down.transpose(1, 2).contiguous(), mask_down)
-            for transformer_block in transformer_blocks:
-                x = transformer_block(
-                    hidden_states=x,
-                    # attention_mask=attn_mask,
-                    timestep=t,
-                )
-            x = rearrange(x, "b t c -> b c t").contiguous()
-            hiddens.append(x)  # Save hidden states for skip connections
-            x = downsample(x * mask_down)
-            masks.append(mask_down[:, :, ::2])
-        masks = masks[:-1]
-        mask_mid = masks[-1]
-        for resnet, transformer_blocks in self.mid_blocks:
-            x = resnet(x, mask_mid, t)
-            x = rearrange(x, "b c t -> b t c").contiguous()
-            # attn_mask = torch.matmul(mask_mid.transpose(1, 2).contiguous(), mask_mid)
-            for transformer_block in transformer_blocks:
-                x = transformer_block(
-                    hidden_states=x,
-                    # attention_mask=attn_mask,
-                    timestep=t,
-                )
-            x = rearrange(x, "b t c -> b c t").contiguous()
-        for resnet, transformer_blocks, upsample in self.up_blocks:
-            mask_up = masks.pop()
-            skip = hiddens.pop()
-            x = pack([x[:, :, : skip.shape[-1]], skip], "b * t")[0]
-            x = resnet(x, mask_up, t)
-            x = rearrange(x, "b c t -> b t c").contiguous()
-            # attn_mask = torch.matmul(mask_up.transpose(1, 2).contiguous(), mask_up)
-            for transformer_block in transformer_blocks:
-                x = transformer_block(
-                    hidden_states=x,
-                    # attention_mask=attn_mask,
-                    timestep=t,
-                )
-            x = rearrange(x, "b t c -> b c t").contiguous()
-            x = upsample(x * mask_up)
-        x = self.final_block(x, mask_up)
-        output = self.final_proj(x * mask_up)
-        return output * mask

cosyvoice/flow/flow.py DELETED Viewed

@@ -1,196 +0,0 @@
-# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import logging
-import random
-from typing import Dict, Optional
-import torch
-import torch.nn as nn
-from torch.nn import functional as F
-from omegaconf import DictConfig
-from cosyvoice.utils.mask import make_pad_mask
-import time
-class MaskedDiffWithXvec(torch.nn.Module):
-    def __init__(
-        self,
-        input_size: int = 512,
-        output_size: int = 80,
-        spk_embed_dim: int = 192,
-        output_type: str = "mel",
-        vocab_size: int = 4096,
-        input_frame_rate: int = 50,
-        only_mask_loss: bool = True,
-        encoder: torch.nn.Module = None,
-        length_regulator: torch.nn.Module = None,
-        decoder: torch.nn.Module = None,
-        decoder_conf: Dict = {
-            "in_channels": 240,
-            "out_channel": 80,
-            "spk_emb_dim": 80,
-            "n_spks": 1,
-            "cfm_params": DictConfig(
-                {
-                    "sigma_min": 1e-06,
-                    "solver": "euler",
-                    "t_scheduler": "cosine",
-                    "training_cfg_rate": 0.2,
-                    "inference_cfg_rate": 0.7,
-                    "reg_loss_type": "l1",
-                }
-            ),
-            "decoder_params": {
-                "channels": [256, 256],
-                "dropout": 0.0,
-                "attention_head_dim": 64,
-                "n_blocks": 4,
-                "num_mid_blocks": 12,
-                "num_heads": 8,
-                "act_fn": "gelu",
-            },
-        },
-        mel_feat_conf: Dict = {
-            "n_fft": 1024,
-            "num_mels": 80,
-            "sampling_rate": 22050,
-            "hop_size": 256,
-            "win_size": 1024,
-            "fmin": 0,
-            "fmax": 8000,
-        },
-    ):
-        super().__init__()
-        self.input_size = input_size
-        self.output_size = output_size
-        self.decoder_conf = decoder_conf
-        self.mel_feat_conf = mel_feat_conf
-        self.vocab_size = vocab_size
-        self.output_type = output_type
-        self.input_frame_rate = input_frame_rate
-        logging.info(f"input frame rate={self.input_frame_rate}")
-        self.input_embedding = nn.Embedding(vocab_size, input_size)
-        self.spk_embed_affine_layer = torch.nn.Linear(spk_embed_dim, output_size)
-        self.encoder = encoder
-        self.encoder_proj = torch.nn.Linear(self.encoder.output_size(), output_size)
-        self.decoder = decoder
-        self.length_regulator = length_regulator
-        self.only_mask_loss = only_mask_loss
-    def forward(
-        self,
-        batch: dict,
-        device: torch.device,
-    ) -> Dict[str, Optional[torch.Tensor]]:
-        token = batch["speech_token"].to(device)
-        token_len = batch["speech_token_len"].to(device)
-        feat = batch["speech_feat"].to(device)
-        feat_len = batch["speech_feat_len"].to(device)
-        embedding = batch["embedding"].to(device)
-        # xvec projection
-        embedding = F.normalize(embedding, dim=1)
-        embedding = self.spk_embed_affine_layer(embedding)
-        # concat text and prompt_text
-        mask = (~make_pad_mask(token_len)).float().unsqueeze(-1).to(device)
-        token = self.input_embedding(torch.clamp(token, min=0)) * mask
-        # text encode
-        h, h_lengths = self.encoder(token, token_len)
-        h = self.encoder_proj(h)
-        h, h_lengths = self.length_regulator(h, feat_len)
-        # get conditions
-        conds = torch.zeros(feat.shape, device=token.device)
-        for i, j in enumerate(feat_len):
-            if random.random() < 0.5:
-                continue
-            index = random.randint(0, int(0.3 * j))
-            conds[i, :index] = feat[i, :index]
-        conds = conds.transpose(1, 2)
-        mask = (~make_pad_mask(feat_len)).to(h)
-        feat = F.interpolate(
-            feat.unsqueeze(dim=1), size=h.shape[1:], mode="nearest"
-        ).squeeze(dim=1)
-        loss, _ = self.decoder.compute_loss(
-            feat.transpose(1, 2).contiguous(),
-            mask.unsqueeze(1),
-            h.transpose(1, 2).contiguous(),
-            embedding,
-            cond=conds,
-        )
-        return {"loss": loss}
-    @torch.inference_mode()
-    def inference(
-        self,
-        token,
-        token_len,
-        prompt_token,
-        prompt_token_len,
-        prompt_feat,
-        prompt_feat_len,
-        embedding,
-    ):
-        assert token.shape[0] == 1
-        # xvec projection
-        embedding = F.normalize(embedding, dim=1)
-        embedding = self.spk_embed_affine_layer(embedding)
-        # concat text and prompt_text
-        token_len1, token_len2 = prompt_token.shape[1], token.shape[1]
-        # text encode
-        token, token_len = (
-            torch.concat([prompt_token, token], dim=1),
-            prompt_token_len + token_len,
-        )
-        token = self.input_embedding(torch.clamp(token, min=0))
-        h, _ = self.encoder.inference(token, token_len)
-        h = self.encoder_proj(h)
-        mel_len1, mel_len2 = prompt_feat.shape[1], int(
-            token_len2
-            / self.input_frame_rate
-            * self.mel_feat_conf["sampling_rate"]
-            / self.mel_feat_conf["hop_size"]
-        )
-        h, _ = self.length_regulator.inference(
-            h[:, :token_len1],
-            h[:, token_len1:],
-            mel_len1,
-            mel_len2,
-        )
-        # get conditions
-        conds = torch.zeros(
-            [1, mel_len1 + mel_len2, self.output_size], device=token.device
-        )
-        conds[:, :mel_len1] = prompt_feat
-        conds = conds.transpose(1, 2)
-        # mask = (~make_pad_mask(torch.tensor([mel_len1 + mel_len2]))).to(h)
-        mask = torch.ones(
-            [1, mel_len1 + mel_len2], device=h.device, dtype=torch.bfloat16
-        )
-        feat = self.decoder(
-            mu=h.transpose(1, 2).contiguous(),
-            mask=mask.unsqueeze(1),
-            spks=embedding,
-            cond=conds,
-            n_timesteps=10,
-        )
-        feat = feat[:, :, mel_len1:]
-        assert feat.shape[2] == mel_len2
-        return feat

cosyvoice/flow/flow_matching.py DELETED Viewed

@@ -1,315 +0,0 @@
-# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import time
-import torch
-import torch.nn.functional as F
-from cosyvoice.matcha.flow_matching import BASECFM
-class ConditionalCFM(BASECFM):
-    def __init__(
-        self,
-        in_channels,
-        cfm_params,
-        n_spks=1,
-        spk_emb_dim=64,
-        estimator: torch.nn.Module = None,
-    ):
-        super().__init__(
-            n_feats=in_channels,
-            cfm_params=cfm_params,
-            n_spks=n_spks,
-            spk_emb_dim=spk_emb_dim,
-        )
-        self.t_scheduler = cfm_params.t_scheduler
-        self.training_cfg_rate = cfm_params.training_cfg_rate
-        self.inference_cfg_rate = cfm_params.inference_cfg_rate
-        in_channels = in_channels + (spk_emb_dim if n_spks > 0 else 0)
-        # Just change the architecture of the estimator here
-        self.estimator = estimator
-        self.inference_graphs = {}
-        self.inference_buffers = {}
-        # self.capture_inference()
-    @torch.inference_mode()
-    def forward(
-        self,
-        mu,
-        mask,
-        n_timesteps,
-        temperature=1.0,
-        spks=None,
-        cond=None,
-    ):
-        """Forward diffusion
-        Args:
-            mu (torch.Tensor): output of encoder
-                shape: (batch_size, n_feats, mel_timesteps)
-            mask (torch.Tensor): output_mask
-                shape: (batch_size, 1, mel_timesteps)
-            n_timesteps (int): number of diffusion steps
-            temperature (float, optional): temperature for scaling noise. Defaults to 1.0.
-            spks (torch.Tensor, optional): speaker ids. Defaults to None.
-                shape: (batch_size, spk_emb_dim)
-            cond: Not used but kept for future purposes
-        Returns:
-            sample: generated mel-spectrogram
-                shape: (batch_size, n_feats, mel_timesteps)
-        """
-        z = torch.randn_like(mu) * temperature
-        t_span = torch.linspace(0, 1, n_timesteps + 1, device=mu.device, dtype=mu.dtype)
-        if self.t_scheduler == "cosine":
-            t_span = 1 - torch.cos(t_span * 0.5 * torch.pi)
-        return self.solve_euler(
-            z, t_span=t_span, mu=mu, mask=mask, spks=spks, cond=cond
-        )
-    @torch.inference_mode()
-    def capture_inference(self, seq_len_to_capture=list(range(128, 512, 8))):
-        start_time = time.time()
-        print(
-            f"capture_inference for ConditionalCFM solve euler, seq_len_to_capture: {seq_len_to_capture}"
-        )
-        for seq_len in seq_len_to_capture:
-            static_z = torch.randn(
-                1, 80, seq_len, device=torch.device("cuda"), dtype=torch.bfloat16
-            )
-            static_t_span = torch.linspace(
-                0, 1, 11, device=torch.device("cuda"), dtype=torch.bfloat16
-            )  # only capture at 10 steps
-            static_mu = torch.randn(
-                1, 80, seq_len, device=torch.device("cuda"), dtype=torch.bfloat16
-            )
-            static_mask = torch.ones(
-                1, 1, seq_len, device=torch.device("cuda"), dtype=torch.bfloat16
-            )
-            static_spks = torch.randn(
-                1, 80, device=torch.device("cuda"), dtype=torch.bfloat16
-            )
-            static_cond = torch.randn(
-                1, 80, seq_len, device=torch.device("cuda"), dtype=torch.float32
-            )
-            static_out = torch.randn(
-                1, 80, seq_len, device=torch.device("cuda"), dtype=torch.bfloat16
-            )
-            self._solve_euler_impl(
-                static_z,
-                t_span=static_t_span,
-                mu=static_mu,
-                mask=static_mask,
-                spks=static_spks,
-                cond=static_cond,
-            )
-            torch.cuda.synchronize()
-            g = torch.cuda.CUDAGraph()
-            with torch.cuda.graph(g):
-                static_out = self._solve_euler_impl(
-                    static_z,
-                    t_span=static_t_span,
-                    mu=static_mu,
-                    mask=static_mask,
-                    spks=static_spks,
-                    cond=static_cond,
-                )
-        self.inference_buffers[seq_len] = {
-            "z": static_z,
-            "t_span": static_t_span,
-            "mu": static_mu,
-            "mask": static_mask,
-            "spks": static_spks,
-            "cond": static_cond,
-            "out": static_out,
-        }
-        self.inference_graphs[seq_len] = g
-        end_time = time.time()
-        print(
-            f"capture_inference for ConditionalCFM solve euler, time elapsed: {end_time - start_time}"
-        )
-    def solve_euler(self, x, t_span, mu, mask, spks, cond):
-        if hasattr(self, "inference_graphs") and len(self.inference_graphs) > 0:
-            curr_seq_len = x.shape[2]
-            available_lengths = sorted(list(self.inference_graphs.keys()))
-            if curr_seq_len <= max(available_lengths):
-                target_len = min(available_lengths, key=lambda x: abs(x - curr_seq_len))
-                if target_len == curr_seq_len:
-                    padded_x = x
-                    padded_mu = mu
-                    padded_mask = mask
-                    if cond is not None:
-                        padded_cond = cond
-                else:
-                    padded_x = torch.randn(
-                        (x.shape[0], x.shape[1], target_len),
-                        dtype=x.dtype,
-                        device=x.device,
-                    )
-                    padded_x[:, :, :curr_seq_len] = x
-                    padded_mu = torch.randn(
-                        (mu.shape[0], mu.shape[1], target_len),
-                        dtype=mu.dtype,
-                        device=mu.device,
-                    )
-                    padded_mu[:, :, :curr_seq_len] = mu
-                    # FIXME(ys): uses zeros and maskgroupnorm
-                    padded_mask = torch.ones(
-                        (mask.shape[0], mask.shape[1], target_len),
-                        dtype=mask.dtype,
-                        device=mask.device,
-                    )
-                    if cond is not None:
-                        padded_cond = torch.randn(
-                            (cond.shape[0], cond.shape[1], target_len),
-                            dtype=cond.dtype,
-                            device=cond.device,
-                        )
-                        padded_cond[:, :, :curr_seq_len] = cond
-                buffer = self.inference_buffers[target_len]
-                buffer["z"].copy_(padded_x)
-                buffer["t_span"].copy_(t_span)
-                buffer["mu"].copy_(padded_mu)
-                buffer["mask"].copy_(padded_mask)
-                buffer["spks"].copy_(spks)
-                if cond is not None:
-                    buffer["cond"].copy_(padded_cond)
-                self.inference_graphs[target_len].replay()
-                output = buffer["out"][:, :, :curr_seq_len]
-                return output
-        return self._solve_euler_impl(x, t_span, mu, mask, spks, cond)
-    def _solve_euler_impl(self, x, t_span, mu, mask, spks, cond):
-        """
-        Fixed euler solver for ODEs.
-        Args:
-            x (torch.Tensor): random noise
-            t_span (torch.Tensor): n_timesteps interpolated
-                shape: (n_timesteps + 1,)
-            mu (torch.Tensor): output of encoder
-                shape: (batch_size, n_feats, mel_timesteps)
-            mask (torch.Tensor): output_mask
-                shape: (batch_size, 1, mel_timesteps)
-            spks (torch.Tensor, optional): speaker ids. Defaults to None.
-                shape: (batch_size, spk_emb_dim)
-            cond: Not used but kept for future purposes
-        """
-        t, _, dt = t_span[0], t_span[-1], t_span[1] - t_span[0]
-        t = t.unsqueeze(dim=0)
-        # I am storing this because I can later plot it by putting a debugger here and saving it to a file
-        # Or in future might add like a return_all_steps flag
-        sol = []
-        for step in range(1, len(t_span)):
-            if self.inference_cfg_rate > 0:
-                x_double = torch.cat([x, x], dim=0)
-                mask_double = torch.cat([mask, mask], dim=0)
-                mu_double = torch.cat([mu, torch.zeros_like(mu)], dim=0)
-                t_double = torch.cat([t, t], dim=0)
-                spks_double = (
-                    torch.cat([spks, torch.zeros_like(spks)], dim=0)
-                    if spks is not None
-                    else None
-                )
-                cond_double = torch.cat([cond, torch.zeros_like(cond)], dim=0)
-                dphi_dt_double = self.forward_estimator(
-                    x_double, mask_double, mu_double, t_double, spks_double, cond_double
-                )
-                dphi_dt, cfg_dphi_dt = torch.chunk(dphi_dt_double, 2, dim=0)
-                dphi_dt = (
-                    1.0 + self.inference_cfg_rate
-                ) * dphi_dt - self.inference_cfg_rate * cfg_dphi_dt
-            else:
-                dphi_dt = self.forward_estimator(x, mask, mu, t, spks, cond)
-            x = x + dt * dphi_dt
-            t = t + dt
-            sol.append(x)
-            if step < len(t_span) - 1:
-                dt = t_span[step + 1] - t
-        return sol[-1]
-    def forward_estimator(self, x, mask, mu, t, spks, cond):
-        if isinstance(self.estimator, torch.nn.Module):
-            return self.estimator.forward(x, mask, mu, t, spks, cond)
-        else:
-            ort_inputs = {
-                "x": x.cpu().numpy(),
-                "mask": mask.cpu().numpy(),
-                "mu": mu.cpu().numpy(),
-                "t": t.cpu().numpy(),
-                "spks": spks.cpu().numpy(),
-                "cond": cond.cpu().numpy(),
-            }
-            output = self.estimator.run(None, ort_inputs)[0]
-            return torch.tensor(output, dtype=x.dtype, device=x.device)
-    def compute_loss(self, x1, mask, mu, spks=None, cond=None):
-        """Computes diffusion loss
-        Args:
-            x1 (torch.Tensor): Target
-                shape: (batch_size, n_feats, mel_timesteps)
-            mask (torch.Tensor): target mask
-                shape: (batch_size, 1, mel_timesteps)
-            mu (torch.Tensor): output of encoder
-                shape: (batch_size, n_feats, mel_timesteps)
-            spks (torch.Tensor, optional): speaker embedding. Defaults to None.
-                shape: (batch_size, spk_emb_dim)
-        Returns:
-            loss: conditional flow matching loss
-            y: conditional flow
-                shape: (batch_size, n_feats, mel_timesteps)
-        """
-        b, _, t = mu.shape
-        # random timestep
-        t = torch.rand([b, 1, 1], device=mu.device, dtype=mu.dtype)
-        if self.t_scheduler == "cosine":
-            t = 1 - torch.cos(t * 0.5 * torch.pi)
-        # sample noise p(x_0)
-        z = torch.randn_like(x1)
-        y = (1 - (1 - self.sigma_min) * t) * z + t * x1
-        u = x1 - (1 - self.sigma_min) * z
-        # during training, we randomly drop condition to trade off mode coverage and sample fidelity
-        if self.training_cfg_rate > 0:
-            cfg_mask = torch.rand(b, device=x1.device) > self.training_cfg_rate
-            mu = mu * cfg_mask.view(-1, 1, 1)
-            spks = spks * cfg_mask.view(-1, 1)
-            cond = cond * cfg_mask.view(-1, 1, 1)
-        pred = self.estimator(y, mask, mu, t.squeeze(), spks, cond)
-        loss = F.mse_loss(pred * mask, u * mask, reduction="sum") / (
-            torch.sum(mask) * u.shape[1]
-        )
-        return loss, y

cosyvoice/flow/length_regulator.py DELETED Viewed

@@ -1,65 +0,0 @@
-# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from typing import Tuple
-import torch.nn as nn
-import torch
-from torch.nn import functional as F
-from cosyvoice.utils.mask import make_pad_mask
-class InterpolateRegulator(nn.Module):
-    def __init__(
-        self,
-        channels: int,
-        sampling_ratios: Tuple,
-        out_channels: int = None,
-        groups: int = 1,
-    ):
-        super().__init__()
-        self.sampling_ratios = sampling_ratios
-        out_channels = out_channels or channels
-        model = nn.ModuleList([])
-        if len(sampling_ratios) > 0:
-            for _ in sampling_ratios:
-                module = nn.Conv1d(channels, channels, 3, 1, 1)
-                norm = nn.GroupNorm(groups, channels)
-                act = nn.Mish()
-                model.extend([module, norm, act])
-        model.append(nn.Conv1d(channels, out_channels, 1, 1))
-        self.model = nn.Sequential(*model)
-    def forward(self, x, ylens=None):
-        # x in (B, T, D)
-        mask = (~make_pad_mask(ylens)).to(x).unsqueeze(-1)
-        x = F.interpolate(
-            x.transpose(1, 2).contiguous(), size=ylens.max(), mode="linear"
-        )
-        out = self.model(x).transpose(1, 2).contiguous()
-        olens = ylens
-        return out * mask, olens
-    def inference(self, x1, x2, mel_len1, mel_len2):
-        # x in (B, T, D)
-        x2 = F.interpolate(
-            x2.transpose(1, 2).contiguous(), size=mel_len2, mode="linear"
-        )
-        if x1.shape[1] != 0:
-            x1 = F.interpolate(
-                x1.transpose(1, 2).contiguous(), size=mel_len1, mode="linear"
-            )
-            x = torch.concat([x1, x2], dim=2)
-        else:
-            x = x2
-        out = self.model(x).transpose(1, 2).contiguous()
-        return out, mel_len1 + mel_len2

cosyvoice/hifigan/f0_predictor.py DELETED Viewed

@@ -1,55 +0,0 @@
-# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Kai Hu)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-import torch.nn as nn
-from torch.nn.utils import weight_norm
-class ConvRNNF0Predictor(nn.Module):
-    def __init__(
-        self, num_class: int = 1, in_channels: int = 80, cond_channels: int = 512
-    ):
-        super().__init__()
-        self.num_class = num_class
-        self.condnet = nn.Sequential(
-            weight_norm(
-                nn.Conv1d(in_channels, cond_channels, kernel_size=3, padding=1)
-            ),
-            nn.ELU(),
-            weight_norm(
-                nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1)
-            ),
-            nn.ELU(),
-            weight_norm(
-                nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1)
-            ),
-            nn.ELU(),
-            weight_norm(
-                nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1)
-            ),
-            nn.ELU(),
-            weight_norm(
-                nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1)
-            ),
-            nn.ELU(),
-        )
-        self.classifier = nn.Linear(
-            in_features=cond_channels, out_features=self.num_class
-        )
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        x = self.condnet(x)
-        x = x.transpose(1, 2)
-        return torch.abs(self.classifier(x).squeeze(-1))

cosyvoice/hifigan/generator.py DELETED Viewed

@@ -1,566 +0,0 @@
-# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Kai Hu)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""HIFI-GAN"""
-import typing as tp
-import time
-import numpy as np
-from scipy.signal import get_window
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from torch.nn import Conv1d
-from torch.nn import ConvTranspose1d
-from torch.nn.utils import remove_weight_norm
-from torch.nn.utils import weight_norm
-from torch.distributions.uniform import Uniform
-from cosyvoice.transformer.activation import Snake
-from cosyvoice.utils.common import get_padding
-from cosyvoice.utils.common import init_weights
-"""hifigan based generator implementation.
-This code is modified from https://github.com/jik876/hifi-gan
- ,https://github.com/kan-bayashi/ParallelWaveGAN and
- https://github.com/NVIDIA/BigVGAN
-"""
-class ResBlock(torch.nn.Module):
-    """Residual block module in HiFiGAN/BigVGAN."""
-    def __init__(
-        self,
-        channels: int = 512,
-        kernel_size: int = 3,
-        dilations: tp.List[int] = [1, 3, 5],
-    ):
-        super(ResBlock, self).__init__()
-        self.convs1 = nn.ModuleList()
-        self.convs2 = nn.ModuleList()
-        for dilation in dilations:
-            self.convs1.append(
-                weight_norm(
-                    Conv1d(
-                        channels,
-                        channels,
-                        kernel_size,
-                        1,
-                        dilation=dilation,
-                        padding=get_padding(kernel_size, dilation),
-                    )
-                )
-            )
-            self.convs2.append(
-                weight_norm(
-                    Conv1d(
-                        channels,
-                        channels,
-                        kernel_size,
-                        1,
-                        dilation=1,
-                        padding=get_padding(kernel_size, 1),
-                    )
-                )
-            )
-        self.convs1.apply(init_weights)
-        self.convs2.apply(init_weights)
-        self.activations1 = nn.ModuleList(
-            [Snake(channels, alpha_logscale=False) for _ in range(len(self.convs1))]
-        )
-        self.activations2 = nn.ModuleList(
-            [Snake(channels, alpha_logscale=False) for _ in range(len(self.convs2))]
-        )
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        for idx in range(len(self.convs1)):
-            xt = self.activations1[idx](x)
-            xt = self.convs1[idx](xt)
-            xt = self.activations2[idx](xt)
-            xt = self.convs2[idx](xt)
-            x = xt + x
-        return x
-    def remove_weight_norm(self):
-        for idx in range(len(self.convs1)):
-            remove_weight_norm(self.convs1[idx])
-            remove_weight_norm(self.convs2[idx])
-class SineGen(torch.nn.Module):
-    """Definition of sine generator
-    SineGen(samp_rate, harmonic_num = 0,
-            sine_amp = 0.1, noise_std = 0.003,
-            voiced_threshold = 0,
-            flag_for_pulse=False)
-    samp_rate: sampling rate in Hz
-    harmonic_num: number of harmonic overtones (default 0)
-    sine_amp: amplitude of sine-wavefrom (default 0.1)
-    noise_std: std of Gaussian noise (default 0.003)
-    voiced_thoreshold: F0 threshold for U/V classification (default 0)
-    flag_for_pulse: this SinGen is used inside PulseGen (default False)
-    Note: when flag_for_pulse is True, the first time step of a voiced
-        segment is always sin(np.pi) or cos(0)
-    """
-    def __init__(
-        self,
-        samp_rate,
-        harmonic_num=0,
-        sine_amp=0.1,
-        noise_std=0.003,
-        voiced_threshold=0,
-    ):
-        super(SineGen, self).__init__()
-        self.sine_amp = sine_amp
-        self.noise_std = noise_std
-        self.harmonic_num = harmonic_num
-        self.sampling_rate = samp_rate
-        self.voiced_threshold = voiced_threshold
-    def _f02uv(self, f0):
-        # generate uv signal
-        uv = (f0 > self.voiced_threshold).type(torch.float32)
-        return uv
-    @torch.no_grad()
-    def forward(self, f0):
-        """
-        :param f0: [B, 1, sample_len], Hz
-        :return: [B, 1, sample_len]
-        """
-        F_mat = torch.zeros((f0.size(0), self.harmonic_num + 1, f0.size(-1))).to(
-            f0.device
-        )
-        for i in range(self.harmonic_num + 1):
-            F_mat[:, i : i + 1, :] = f0 * (i + 1) / self.sampling_rate
-        theta_mat = 2 * np.pi * (torch.cumsum(F_mat, dim=-1) % 1)
-        u_dist = Uniform(low=-np.pi, high=np.pi)
-        phase_vec = u_dist.sample(
-            sample_shape=(f0.size(0), self.harmonic_num + 1, 1)
-        ).to(F_mat.device)
-        phase_vec[:, 0, :] = 0
-        # generate sine waveforms
-        sine_waves = self.sine_amp * torch.sin(theta_mat + phase_vec)
-        # generate uv signal
-        uv = self._f02uv(f0)
-        # noise: for unvoiced should be similar to sine_amp
-        #        std = self.sine_amp/3 -> max value ~ self.sine_amp
-        # .       for voiced regions is self.noise_std
-        noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
-        noise = noise_amp * torch.randn_like(sine_waves)
-        # first: set the unvoiced part to 0 by uv
-        # then: additive noise
-        sine_waves = sine_waves * uv + noise
-        return sine_waves, uv, noise
-class SourceModuleHnNSF(torch.nn.Module):
-    """SourceModule for hn-nsf
-    SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
-                 add_noise_std=0.003, voiced_threshod=0)
-    sampling_rate: sampling_rate in Hz
-    harmonic_num: number of harmonic above F0 (default: 0)
-    sine_amp: amplitude of sine source signal (default: 0.1)
-    add_noise_std: std of additive Gaussian noise (default: 0.003)
-        note that amplitude of noise in unvoiced is decided
-        by sine_amp
-    voiced_threshold: threhold to set U/V given F0 (default: 0)
-    Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
-    F0_sampled (batchsize, length, 1)
-    Sine_source (batchsize, length, 1)
-    noise_source (batchsize, length 1)
-    uv (batchsize, length, 1)
-    """
-    def __init__(
-        self,
-        sampling_rate,
-        upsample_scale,
-        harmonic_num=0,
-        sine_amp=0.1,
-        add_noise_std=0.003,
-        voiced_threshod=0,
-    ):
-        super(SourceModuleHnNSF, self).__init__()
-        self.sine_amp = sine_amp
-        self.noise_std = add_noise_std
-        # to produce sine waveforms
-        self.l_sin_gen = SineGen(
-            sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod
-        )
-        # to merge source harmonics into a single excitation
-        self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
-        self.l_tanh = torch.nn.Tanh()
-    def forward(self, x):
-        """
-        Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
-        F0_sampled (batchsize, length, 1)
-        Sine_source (batchsize, length, 1)
-        noise_source (batchsize, length 1)
-        """
-        # source for harmonic branch
-        with torch.no_grad():
-            sine_wavs, uv, _ = self.l_sin_gen(x.transpose(1, 2))
-            sine_wavs = sine_wavs.transpose(1, 2)
-            uv = uv.transpose(1, 2)
-        sine_merge = self.l_tanh(self.l_linear(sine_wavs))
-        # source for noise branch, in the same shape as uv
-        noise = torch.randn_like(uv) * self.sine_amp / 3
-        return sine_merge, noise, uv
-class HiFTGenerator(nn.Module):
-    """
-    HiFTNet Generator: Neural Source Filter + ISTFTNet
-    https://arxiv.org/abs/2309.09493
-    """
-    def __init__(
-        self,
-        in_channels: int = 80,
-        base_channels: int = 512,
-        nb_harmonics: int = 8,
-        sampling_rate: int = 22050,
-        nsf_alpha: float = 0.1,
-        nsf_sigma: float = 0.003,
-        nsf_voiced_threshold: float = 10,
-        upsample_rates: tp.List[int] = [8, 8],
-        upsample_kernel_sizes: tp.List[int] = [16, 16],
-        istft_params: tp.Dict[str, int] = {"n_fft": 16, "hop_len": 4},
-        resblock_kernel_sizes: tp.List[int] = [3, 7, 11],
-        resblock_dilation_sizes: tp.List[tp.List[int]] = [
-            [1, 3, 5],
-            [1, 3, 5],
-            [1, 3, 5],
-        ],
-        source_resblock_kernel_sizes: tp.List[int] = [7, 11],
-        source_resblock_dilation_sizes: tp.List[tp.List[int]] = [[1, 3, 5], [1, 3, 5]],
-        lrelu_slope: float = 0.1,
-        audio_limit: float = 0.99,
-        f0_predictor: torch.nn.Module = None,
-    ):
-        # Builds, in order: the NSF source module, the F0 upsampler, the input
-        # convolution, the transposed-conv upsampling stack, the source
-        # down-projection path with its residual blocks, the main residual
-        # blocks, the output convolution, and the STFT window.
-        # nsf_alpha / nsf_sigma / nsf_voiced_threshold are forwarded to
-        # SourceModuleHnNSF as sine_amp / add_noise_std / voiced_threshod.
-        # NOTE(review): the list/dict defaults are shared objects; istft_params
-        # and upsample_rates are stored on self by reference, not copied.
-        super(HiFTGenerator, self).__init__()
-        self.out_channels = 1
-        self.nb_harmonics = nb_harmonics
-        self.sampling_rate = sampling_rate
-        self.istft_params = istft_params
-        self.lrelu_slope = lrelu_slope
-        self.audio_limit = audio_limit
-        self.num_kernels = len(resblock_kernel_sizes)
-        self.num_upsamples = len(upsample_rates)
-        self.upsample_rates = upsample_rates
-        # total upsampling factor = product of the upsample rates times the iSTFT hop length
-        self.m_source = SourceModuleHnNSF(
-            sampling_rate=sampling_rate,
-            upsample_scale=np.prod(upsample_rates) * istft_params["hop_len"],
-            harmonic_num=nb_harmonics,
-            sine_amp=nsf_alpha,
-            add_noise_std=nsf_sigma,
-            voiced_threshod=nsf_voiced_threshold,
-        )
-        self.f0_upsamp = torch.nn.Upsample(
-            scale_factor=np.prod(upsample_rates) * istft_params["hop_len"]
-        )
-        self.conv_pre = weight_norm(Conv1d(in_channels, base_channels, 7, 1, padding=3))
-        # Up
-        # each stage halves the channel count: base_channels // 2**i -> base_channels // 2**(i + 1)
-        self.ups = nn.ModuleList()
-        for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
-            self.ups.append(
-                weight_norm(
-                    ConvTranspose1d(
-                        base_channels // (2**i),
-                        base_channels // (2 ** (i + 1)),
-                        k,
-                        u,
-                        padding=(k - u) // 2,
-                    )
-                )
-            )
-        # Down
-        # one projection per upsampling stage maps the source STFT (n_fft + 2 channels)
-        # to that stage's channel count; these convolutions are NOT wrapped in weight_norm
-        self.source_downs = nn.ModuleList()
-        self.source_resblocks = nn.ModuleList()
-        downsample_rates = [1] + upsample_rates[::-1][:-1]
-        downsample_cum_rates = np.cumprod(downsample_rates)
-        for i, (u, k, d) in enumerate(
-            zip(
-                downsample_cum_rates[::-1],
-                source_resblock_kernel_sizes,
-                source_resblock_dilation_sizes,
-            )
-        ):
-            if u == 1:
-                # no striding needed: pointwise projection
-                self.source_downs.append(
-                    Conv1d(
-                        istft_params["n_fft"] + 2, base_channels // (2 ** (i + 1)), 1, 1
-                    )
-                )
-            else:
-                # strided convolution with stride u and kernel 2 * u
-                self.source_downs.append(
-                    Conv1d(
-                        istft_params["n_fft"] + 2,
-                        base_channels // (2 ** (i + 1)),
-                        u * 2,
-                        u,
-                        padding=(u // 2),
-                    )
-                )
-            self.source_resblocks.append(
-                ResBlock(base_channels // (2 ** (i + 1)), k, d)
-            )
-        # len(resblock_kernel_sizes) residual blocks per upsampling stage, stored flat;
-        # forward() indexes them as i * num_kernels + j
-        self.resblocks = nn.ModuleList()
-        for i in range(len(self.ups)):
-            ch = base_channels // (2 ** (i + 1))
-            for _, (k, d) in enumerate(
-                zip(resblock_kernel_sizes, resblock_dilation_sizes)
-            ):
-                self.resblocks.append(ResBlock(ch, k, d))
-        # ch still holds the channel count of the last stage here
-        # (it is undefined if upsample_rates is empty)
-        self.conv_post = weight_norm(
-            Conv1d(ch, istft_params["n_fft"] + 2, 7, 1, padding=3)
-        )
-        self.ups.apply(init_weights)
-        self.conv_post.apply(init_weights)
-        self.reflection_pad = nn.ReflectionPad1d((1, 0))
-        # NOTE(review): the window is a plain attribute created directly on CUDA, so
-        # constructing this module requires a CUDA device and the window does not
-        # follow .to(device) calls made on the module
-        self.stft_window = torch.from_numpy(
-            get_window("hann", istft_params["n_fft"], fftbins=True).astype(np.float32)
-        ).cuda()
-        self.f0_predictor = f0_predictor
-        # filled by capture_inference(): seq_len -> static tensors / captured CUDA graph
-        self.inference_buffers = {}
-        self.inference_graphs = {}
-    def _f02source(self, f0: torch.Tensor) -> torch.Tensor:
-        f0 = self.f0_upsamp(f0[:, None]).transpose(1, 2)  # bs,n,t
-        har_source, _, _ = self.m_source(f0)
-        return har_source.transpose(1, 2)
-    def _stft(self, x):
-        spec = torch.stft(
-            x,
-            self.istft_params["n_fft"],
-            self.istft_params["hop_len"],
-            self.istft_params["n_fft"],
-            window=self.stft_window,
-            return_complex=True,
-        )
-        spec = torch.view_as_real(spec)  # [B, F, TT, 2]
-        return spec[..., 0], spec[..., 1]
-    def _istft(self, magnitude, phase):
-        magnitude = torch.clip(magnitude, max=1e2)
-        real = magnitude * torch.cos(phase)
-        img = magnitude * torch.sin(phase)
-        inverse_transform = torch.istft(
-            torch.complex(real, img),
-            self.istft_params["n_fft"],
-            self.istft_params["hop_len"],
-            self.istft_params["n_fft"],
-            window=self.stft_window,
-        )
-        return inverse_transform
-    def forward(
-        self, x: torch.Tensor, cache_source: torch.Tensor = torch.zeros(1, 1, 0)
-    ) -> torch.Tensor:
-        # Generates a waveform from x and also returns the excitation source.
-        # x is fed both to self.f0_predictor and to self.conv_pre.
-        # Although the annotation says torch.Tensor, the method returns the pair
-        # (waveform clamped to +/- audio_limit, source s).
-        f0 = self.f0_predictor(x)
-        s = self._f02source(f0)
-        # use cache_source to avoid glitch
-        # a non-empty cache overwrites the leading samples of the new source in place
-        if cache_source.shape[2] != 0:
-            s[:, :, : cache_source.shape[2]] = cache_source
-        s_stft_real, s_stft_imag = self._stft(s.squeeze(1))
-        # stack real and imaginary parts along the channel axis
-        s_stft = torch.cat([s_stft_real, s_stft_imag], dim=1)
-        x = self.conv_pre(x)
-        for i in range(self.num_upsamples):
-            x = F.leaky_relu(x, self.lrelu_slope)
-            x = self.ups[i](x)
-            # the last stage gets one extra reflected sample on the left
-            if i == self.num_upsamples - 1:
-                x = self.reflection_pad(x)
-            # fusion
-            si = self.source_downs[i](s_stft)
-            si = self.source_resblocks[i](si)
-            x = x + si
-            # average the outputs of this stage's num_kernels residual blocks
-            xs = None
-            for j in range(self.num_kernels):
-                if xs is None:
-                    xs = self.resblocks[i * self.num_kernels + j](x)
-                else:
-                    xs += self.resblocks[i * self.num_kernels + j](x)
-            x = xs / self.num_kernels
-        # NOTE(review): this call uses leaky_relu's default slope, not self.lrelu_slope
-        x = F.leaky_relu(x)
-        x = self.conv_post(x)
-        # first n_fft // 2 + 1 channels -> log-magnitude, remaining channels -> phase
-        magnitude = torch.exp(x[:, : self.istft_params["n_fft"] // 2 + 1, :])
-        phase = torch.sin(
-            x[:, self.istft_params["n_fft"] // 2 + 1 :, :]
-        )  # actually, sin is redundancy
-        x = self._istft(magnitude, phase)
-        x = torch.clamp(x, -self.audio_limit, self.audio_limit)
-        return x, s
-    def remove_weight_norm(self):
-        print("Removing weight norm...")
-        for l in self.ups:
-            remove_weight_norm(l)
-        for l in self.resblocks:
-            l.remove_weight_norm()
-        remove_weight_norm(self.conv_pre)
-        remove_weight_norm(self.conv_post)
-        self.source_module.remove_weight_norm()
-        for l in self.source_downs:
-            remove_weight_norm(l)
-        for l in self.source_resblocks:
-            l.remove_weight_norm()
-    @torch.inference_mode()
-    def _inference_impl(self, mel: torch.Tensor, s_stft: torch.Tensor) -> torch.Tensor:
-        # Network body shared by inference() and capture_inference(): takes the mel
-        # input and the source STFT and returns the pair (magnitude, phase); the
-        # inverse STFT is applied by the caller. The return annotation says
-        # torch.Tensor but two tensors are returned.
-        x = self.conv_pre(mel)
-        for i in range(self.num_upsamples):
-            x = F.leaky_relu(x, self.lrelu_slope)
-            x = self.ups[i](x)
-            # the last stage gets one extra reflected sample on the left
-            if i == self.num_upsamples - 1:
-                x = self.reflection_pad(x)
-            # fusion
-            si = self.source_downs[i](s_stft)
-            si = self.source_resblocks[i](si)
-            x = x + si
-            # average the outputs of this stage's num_kernels residual blocks
-            xs = None
-            for j in range(self.num_kernels):
-                if xs is None:
-                    xs = self.resblocks[i * self.num_kernels + j](x)
-                else:
-                    xs += self.resblocks[i * self.num_kernels + j](x)
-            x = xs / self.num_kernels
-        x = F.leaky_relu(x)
-        x = self.conv_post(x)
-        # first n_fft // 2 + 1 channels -> log-magnitude, remaining channels -> phase
-        magnitude = torch.exp(x[:, : self.istft_params["n_fft"] // 2 + 1, :])
-        phase = torch.sin(
-            x[:, self.istft_params["n_fft"] // 2 + 1 :, :]
-        )  # actually, sin is redundancy
-        # print(f"mel: {mel.shape}, magnitude: {magnitude.shape}, phase: {phase.shape}")
-        return magnitude, phase
-    @torch.inference_mode()
-    def inference(
-        self, mel: torch.Tensor, cache_source: torch.Tensor = torch.zeros(1, 1, 0)
-    ) -> torch.Tensor:
-        # Inference entry point. If capture_inference() recorded a CUDA graph for a
-        # sequence length >= mel.shape[2], the smallest such graph is replayed on
-        # zero-padded inputs; otherwise the network runs eagerly.
-        # Returns the pair (waveform clamped to +/- audio_limit, source s).
-        # NOTE(review): cache_source is accepted but never read in this method,
-        # unlike forward(), which splices it into the source.
-        curr_seq_len = mel.shape[2]
-        f0 = self.f0_predictor(mel)
-        s = self._f02source(f0)
-        s_stft_real, s_stft_imag = self._stft(s.squeeze(1))
-        s_stft = torch.cat([s_stft_real, s_stft_imag], dim=1)
-        # pick the smallest captured length that can hold the current input
-        target_len = None
-        for seq_len in sorted(self.inference_buffers.keys()):
-            if curr_seq_len <= seq_len:
-                target_len = seq_len
-                break
-        if target_len is not None:
-            buffer = self.inference_buffers[target_len]
-            if curr_seq_len < target_len:
-                # shorter input: copy into zero-padded tensors, then into the static graph inputs
-                padded_mel = torch.zeros_like(buffer["mel"])
-                padded_mel[:, :, :curr_seq_len] = mel
-                buffer["mel"].copy_(padded_mel)
-                padded_s_stft = torch.zeros_like(buffer["s_stft"])
-                cur_s_stft_len = s_stft.shape[2]
-                padded_s_stft[:, :, :cur_s_stft_len] = s_stft
-                buffer["s_stft"].copy_(padded_s_stft)
-            else:
-                # exact match: copy straight into the static graph inputs
-                buffer["mel"].copy_(mel)
-                buffer["s_stft"].copy_(s_stft)
-                cur_s_stft_len = s_stft.shape[2]
-            self.inference_graphs[target_len].replay()
-            # keep only the frames that correspond to the real (unpadded) input
-            magnitude, phase = (
-                buffer["magnitude"][:, :, :cur_s_stft_len],
-                buffer["phase"][:, :, :cur_s_stft_len],
-            )
-        else:
-            # no captured graph is large enough: run eagerly
-            magnitude, phase = self._inference_impl(mel=mel, s_stft=s_stft)
-        x = self._istft(magnitude, phase)
-        x = torch.clamp(x, -self.audio_limit, self.audio_limit)
-        return x, s
-    @torch.inference_mode()
-    def capture_inference(self, seq_len_to_capture=[64, 128, 256, 512, 1024]):
-        # Records one CUDA graph of _inference_impl per requested mel length and
-        # stores the static input/output tensors in self.inference_buffers and the
-        # graphs in self.inference_graphs, both keyed by sequence length.
-        # The default list is only iterated, never mutated.
-        start_time = time.time()
-        print(
-            f"capture inference for HiFTGenerator with seq_len_to_capture: {seq_len_to_capture}"
-        )
-        for seq_len in seq_len_to_capture:
-            # random placeholder input; the mel channel count is fixed at 80 here
-            mel = torch.randn(
-                1, 80, seq_len, device=torch.device("cuda"), dtype=torch.float32
-            )
-            f0 = self.f0_predictor(mel)
-            s = self._f02source(f0)
-            s_stft_real, s_stft_imag = self._stft(s.squeeze(1))
-            s_stft = torch.cat([s_stft_real, s_stft_imag], dim=1)
-            # one eager run before capture, then wait for it to finish
-            magnitude, phase = self._inference_impl(mel=mel, s_stft=s_stft)
-            torch.cuda.synchronize()
-            g = torch.cuda.CUDAGraph()
-            with torch.cuda.graph(g):
-                magnitude, phase = self._inference_impl(mel=mel, s_stft=s_stft)
-            # the tensors used during capture become the graph's static inputs and outputs
-            inference_buffer = {
-                "mel": mel,
-                "s_stft": s_stft,
-                "magnitude": magnitude,
-                "phase": phase,
-            }
-            self.inference_buffers[seq_len] = inference_buffer
-            self.inference_graphs[seq_len] = g
-        end_time = time.time()
-        print(
-            f"capture inference for HiFTGenerator with seq_len_to_capture: {seq_len_to_capture} takes {end_time - start_time} seconds"
-        )

cosyvoice/matcha/audio.py DELETED Viewed

@@ -1,90 +0,0 @@
-import numpy as np
-import torch
-import torch.utils.data
-from librosa.filters import mel as librosa_mel_fn
-from scipy.io.wavfile import read
-MAX_WAV_VALUE = 32768.0
-def load_wav(full_path):
-    sampling_rate, data = read(full_path)
-    return data, sampling_rate
-def dynamic_range_compression(x, C=1, clip_val=1e-5):
-    return np.log(np.clip(x, a_min=clip_val, a_max=None) * C)
-def dynamic_range_decompression(x, C=1):
-    return np.exp(x) / C
-def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
-    return torch.log(torch.clamp(x, min=clip_val) * C)
-def dynamic_range_decompression_torch(x, C=1):
-    return torch.exp(x) / C
-def spectral_normalize_torch(magnitudes):
-    output = dynamic_range_compression_torch(magnitudes)
-    return output
-def spectral_de_normalize_torch(magnitudes):
-    output = dynamic_range_decompression_torch(magnitudes)
-    return output
-mel_basis = {}
-hann_window = {}
-def mel_spectrogram(
-    y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False
-):
-    if torch.min(y) < -1.0:
-        print("min value is ", torch.min(y))
-    if torch.max(y) > 1.0:
-        print("max value is ", torch.max(y))
-    global mel_basis, hann_window  # pylint: disable=global-statement
-    if f"{str(fmax)}_{str(y.device)}" not in mel_basis:
-        mel = librosa_mel_fn(
-            sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax
-        )
-        mel_basis[str(fmax) + "_" + str(y.device)] = (
-            torch.from_numpy(mel).float().to(y.device)
-        )
-        hann_window[str(y.device)] = torch.hann_window(win_size).to(y.device)
-    y = torch.nn.functional.pad(
-        y.unsqueeze(1),
-        (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)),
-        mode="reflect",
-    )
-    y = y.squeeze(1)
-    spec = torch.view_as_real(
-        torch.stft(
-            y,
-            n_fft,
-            hop_length=hop_size,
-            win_length=win_size,
-            window=hann_window[str(y.device)],
-            center=center,
-            pad_mode="reflect",
-            normalized=False,
-            onesided=True,
-            return_complex=True,
-        )
-    )
-    spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9))
-    spec = torch.matmul(mel_basis[str(fmax) + "_" + str(y.device)], spec)
-    spec = spectral_normalize_torch(spec)
-    return spec

cosyvoice/matcha/decoder.py DELETED Viewed

@@ -1,511 +0,0 @@
-import math
-from typing import Optional
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from conformer import ConformerBlock
-from diffusers.models.activations import get_activation
-from einops import pack, rearrange, repeat
-from cosyvoice.matcha.transformer import BasicTransformerBlock
-class SinusoidalPosEmb(torch.nn.Module):
-    def __init__(self, dim):
-        super().__init__()
-        self.dim = dim
-        assert self.dim % 2 == 0, "SinusoidalPosEmb requires dim to be even"
-    def forward(self, x, scale=1000):
-        if x.ndim < 1:
-            x = x.unsqueeze(0)
-        device = x.device
-        half_dim = self.dim // 2
-        emb = math.log(10000) / (half_dim - 1)
-        emb = torch.exp(torch.arange(half_dim, device=device).float() * -emb)
-        emb = scale * x.unsqueeze(1) * emb.unsqueeze(0)
-        emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
-        return emb
-class MaskedGroupNorm(nn.GroupNorm):
-    """
-    Masked verstion of the Group normalization.
-    Based on: https://github.com/ptrblck/pytorch_misc/blob/20e8ea93bd458b88f921a87e2d4001a4eb753a02/batch_norm_manual.py
-    Receives a N-dim tensor of sequence lengths per batch element
-    along with the regular input for masking.
-    Check pytorch's GroupNorm implementation for argument details.
-    """
-    def __init__(self, num_groups, num_channels, eps=1e-5, affine=True):
-        super(MaskedGroupNorm, self).__init__(num_groups, num_channels, eps, affine)
-    def forward(self, inp, mask=None):
-        assert (
-            inp.shape[1] % self.num_groups == 0
-        ), "Feature size not divisible by groups"
-        # 计算有效长度
-        seq_lengths = mask.sum(-1, keepdim=True)  # [batch_size, 1]
-        # 将输入reshape为groups
-        features_per_group = inp.shape[1] // self.num_groups
-        inp_r = inp.reshape(
-            inp.shape[0], self.num_groups, features_per_group, inp.shape[-1]
-        )
-        mask_r = mask.unsqueeze(1)  # [batch_size, 1, 1, length]
-        # 计算masked mean和variance
-        masked_inp = inp_r * mask_r
-        n = seq_lengths * features_per_group  # 每组的有效元素数量
-        mean = masked_inp.sum([2, 3], keepdim=True) / (n.view(-1, 1, 1, 1) + 1e-5)
-        var = ((masked_inp - mean * mask_r) ** 2).sum([2, 3], keepdim=True) / (
-            n.view(-1, 1, 1, 1) + 1e-5
-        )
-        # 标准化
-        inp_r = (inp_r - mean) / (torch.sqrt(var + self.eps))
-        out = inp_r.reshape(inp.shape[0], self.num_channels, inp.shape[-1])
-        # 应用仿射变换
-        if self.affine:
-            out = out * self.weight[None, :, None] + self.bias[None, :, None]
-        return out
-class Block1D(torch.nn.Module):
-    def __init__(self, dim, dim_out, groups=8):
-        super().__init__()
-        self.block = torch.nn.Sequential(
-            torch.nn.Conv1d(dim, dim_out, 3, padding=1),
-            torch.nn.GroupNorm(groups, dim_out),
-            # MaskedGroupNorm(groups, dim_out),
-            nn.Mish(),
-        )
-    def forward(self, x, mask):
-        output = self.block(x * mask)
-        return output * mask
-        return x * mask
-class ResnetBlock1D(torch.nn.Module):
-    # Residual block: two Block1D stages with a time-embedding bias added in
-    # between, plus a 1x1 convolution on the (masked) input as the skip path.
-    def __init__(self, dim, dim_out, time_emb_dim, groups=8):
-        super().__init__()
-        # projects the time embedding to dim_out channels
-        self.mlp = torch.nn.Sequential(
-            nn.Mish(), torch.nn.Linear(time_emb_dim, dim_out)
-        )
-        self.block1 = Block1D(dim, dim_out, groups=groups)
-        self.block2 = Block1D(dim_out, dim_out, groups=groups)
-        # 1x1 convolution so the skip path matches dim_out channels
-        self.res_conv = torch.nn.Conv1d(dim, dim_out, 1)
-    def forward(self, x, mask, time_emb):
-        h = self.block1(x, mask)
-        # add the projected time embedding to every time step (in place)
-        h += self.mlp(time_emb).unsqueeze(-1)
-        h = self.block2(h, mask)
-        # NOTE(review): the skip path is computed on x * mask but the sum is not masked again
-        output = h + self.res_conv(x * mask)
-        return output
-class Downsample1D(nn.Module):
-    # Downsamples along the last axis with a Conv1d of kernel 3, stride 2 and
-    # padding 1; the channel count is unchanged.
-    def __init__(self, dim):
-        super().__init__()
-        self.conv = torch.nn.Conv1d(dim, dim, 3, 2, 1)
-    def forward(self, x):
-        return self.conv(x)
-class TimestepEmbedding(nn.Module):
-    def __init__(
-        self,
-        in_channels: int,
-        time_embed_dim: int,
-        act_fn: str = "silu",
-        out_dim: int = None,
-        post_act_fn: Optional[str] = None,
-        cond_proj_dim=None,
-    ):
-        super().__init__()
-        self.linear_1 = nn.Linear(in_channels, time_embed_dim)
-        if cond_proj_dim is not None:
-            self.cond_proj = nn.Linear(cond_proj_dim, in_channels, bias=False)
-        else:
-            self.cond_proj = None
-        self.act = get_activation(act_fn)
-        if out_dim is not None:
-            time_embed_dim_out = out_dim
-        else:
-            time_embed_dim_out = time_embed_dim
-        self.linear_2 = nn.Linear(time_embed_dim, time_embed_dim_out)
-        if post_act_fn is None:
-            self.post_act = None
-        else:
-            self.post_act = get_activation(post_act_fn)
-    def forward(self, sample, condition=None):
-        if condition is not None:
-            sample = sample + self.cond_proj(condition)
-        sample = self.linear_1(sample)
-        if self.act is not None:
-            sample = self.act(sample)
-        sample = self.linear_2(sample)
-        if self.post_act is not None:
-            sample = self.post_act(sample)
-        return sample
-class Upsample1D(nn.Module):
-    """A 1D upsampling layer with an optional convolution.
-    Parameters:
-        channels (`int`):
-            number of channels in the inputs and outputs.
-        use_conv (`bool`, default `False`):
-            option to use a convolution.
-        use_conv_transpose (`bool`, default `False`):
-            option to use a convolution transpose.
-        out_channels (`int`, optional):
-            number of output channels. Defaults to `channels`.
-    """
-    def __init__(
-        self,
-        channels,
-        use_conv=False,
-        use_conv_transpose=True,
-        out_channels=None,
-        name="conv",
-    ):
-        super().__init__()
-        self.channels = channels
-        self.out_channels = out_channels or channels
-        self.use_conv = use_conv
-        self.use_conv_transpose = use_conv_transpose
-        self.name = name
-        self.conv = None
-        if use_conv_transpose:
-            self.conv = nn.ConvTranspose1d(channels, self.out_channels, 4, 2, 1)
-        elif use_conv:
-            self.conv = nn.Conv1d(self.channels, self.out_channels, 3, padding=1)
-    def forward(self, inputs):
-        assert inputs.shape[1] == self.channels
-        if self.use_conv_transpose:
-            return self.conv(inputs)
-        outputs = F.interpolate(inputs, scale_factor=2.0, mode="nearest")
-        if self.use_conv:
-            outputs = self.conv(outputs)
-        return outputs
-class ConformerWrapper(ConformerBlock):
-    # Adapter that gives ConformerBlock the same keyword call signature the
-    # Decoder uses for its transformer blocks (hidden_states / attention_mask /
-    # timestep).
-    def __init__(  # pylint: disable=useless-super-delegation
-        self,
-        *,
-        dim,
-        dim_head=64,
-        heads=8,
-        ff_mult=4,
-        conv_expansion_factor=2,
-        conv_kernel_size=31,
-        attn_dropout=0,
-        ff_dropout=0,
-        conv_dropout=0,
-        conv_causal=False,
-    ):
-        # all arguments are keyword-only and forwarded unchanged
-        super().__init__(
-            dim=dim,
-            dim_head=dim_head,
-            heads=heads,
-            ff_mult=ff_mult,
-            conv_expansion_factor=conv_expansion_factor,
-            conv_kernel_size=conv_kernel_size,
-            attn_dropout=attn_dropout,
-            ff_dropout=ff_dropout,
-            conv_dropout=conv_dropout,
-            conv_causal=conv_causal,
-        )
-    def forward(
-        self,
-        hidden_states,
-        attention_mask,
-        encoder_hidden_states=None,
-        encoder_attention_mask=None,
-        timestep=None,
-    ):
-        # encoder_hidden_states, encoder_attention_mask and timestep are accepted
-        # but ignored; the mask is cast to bool before being passed on
-        return super().forward(x=hidden_states, mask=attention_mask.bool())
-class Decoder(nn.Module):
-    def __init__(
-        self,
-        in_channels,
-        out_channels,
-        channels=(256, 256),
-        dropout=0.05,
-        attention_head_dim=64,
-        n_blocks=1,
-        num_mid_blocks=2,
-        num_heads=4,
-        act_fn="snake",
-        down_block_type="transformer",
-        mid_block_type="transformer",
-        up_block_type="transformer",
-    ):
-        super().__init__()
-        channels = tuple(channels)
-        self.in_channels = in_channels
-        self.out_channels = out_channels
-        self.time_embeddings = SinusoidalPosEmb(in_channels)
-        time_embed_dim = channels[0] * 4
-        self.time_mlp = TimestepEmbedding(
-            in_channels=in_channels,
-            time_embed_dim=time_embed_dim,
-            act_fn="silu",
-        )
-        self.down_blocks = nn.ModuleList([])
-        self.mid_blocks = nn.ModuleList([])
-        self.up_blocks = nn.ModuleList([])
-        output_channel = in_channels
-        for i in range(len(channels)):  # pylint: disable=consider-using-enumerate
-            input_channel = output_channel
-            output_channel = channels[i]
-            is_last = i == len(channels) - 1
-            resnet = ResnetBlock1D(
-                dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim
-            )
-            transformer_blocks = nn.ModuleList(
-                [
-                    self.get_block(
-                        down_block_type,
-                        output_channel,
-                        attention_head_dim,
-                        num_heads,
-                        dropout,
-                        act_fn,
-                    )
-                    for _ in range(n_blocks)
-                ]
-            )
-            downsample = (
-                Downsample1D(output_channel)
-                if not is_last
-                else nn.Conv1d(output_channel, output_channel, 3, padding=1)
-            )
-            self.down_blocks.append(
-                nn.ModuleList([resnet, transformer_blocks, downsample])
-            )
-        for i in range(num_mid_blocks):
-            input_channel = channels[-1]
-            out_channels = channels[-1]
-            resnet = ResnetBlock1D(
-                dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim
-            )
-            transformer_blocks = nn.ModuleList(
-                [
-                    self.get_block(
-                        mid_block_type,
-                        output_channel,
-                        attention_head_dim,
-                        num_heads,
-                        dropout,
-                        act_fn,
-                    )
-                    for _ in range(n_blocks)
-                ]
-            )
-            self.mid_blocks.append(nn.ModuleList([resnet, transformer_blocks]))
-        channels = channels[::-1] + (channels[0],)
-        for i in range(len(channels) - 1):
-            input_channel = channels[i]
-            output_channel = channels[i + 1]
-            is_last = i == len(channels) - 2
-            resnet = ResnetBlock1D(
-                dim=2 * input_channel,
-                dim_out=output_channel,
-                time_emb_dim=time_embed_dim,
-            )
-            transformer_blocks = nn.ModuleList(
-                [
-                    self.get_block(
-                        up_block_type,
-                        output_channel,
-                        attention_head_dim,
-                        num_heads,
-                        dropout,
-                        act_fn,
-                    )
-                    for _ in range(n_blocks)
-                ]
-            )
-            upsample = (
-                Upsample1D(output_channel, use_conv_transpose=True)
-                if not is_last
-                else nn.Conv1d(output_channel, output_channel, 3, padding=1)
-            )
-            self.up_blocks.append(nn.ModuleList([resnet, transformer_blocks, upsample]))
-        self.final_block = Block1D(channels[-1], channels[-1])
-        self.final_proj = nn.Conv1d(channels[-1], self.out_channels, 1)
-        self.initialize_weights()
-        # nn.init.normal_(self.final_proj.weight)
-    @staticmethod
-    def get_block(block_type, dim, attention_head_dim, num_heads, dropout, act_fn):
-        if block_type == "conformer":
-            block = ConformerWrapper(
-                dim=dim,
-                dim_head=attention_head_dim,
-                heads=num_heads,
-                ff_mult=1,
-                conv_expansion_factor=2,
-                ff_dropout=dropout,
-                attn_dropout=dropout,
-                conv_dropout=dropout,
-                conv_kernel_size=31,
-            )
-        elif block_type == "transformer":
-            block = BasicTransformerBlock(
-                dim=dim,
-                num_attention_heads=num_heads,
-                attention_head_dim=attention_head_dim,
-                dropout=dropout,
-                activation_fn=act_fn,
-            )
-        else:
-            raise ValueError(f"Unknown block type {block_type}")
-        return block
-    def initialize_weights(self):
-        for m in self.modules():
-            if isinstance(m, nn.Conv1d):
-                nn.init.kaiming_normal_(m.weight, nonlinearity="relu")
-                if m.bias is not None:
-                    nn.init.constant_(m.bias, 0)
-            elif isinstance(m, nn.GroupNorm):
-                nn.init.constant_(m.weight, 1)
-                nn.init.constant_(m.bias, 0)
-            elif isinstance(m, nn.Linear):
-                nn.init.kaiming_normal_(m.weight, nonlinearity="relu")
-                if m.bias is not None:
-                    nn.init.constant_(m.bias, 0)
-    def forward(self, x, mask, mu, t, spks=None, cond=None):
-        """Forward pass of the UNet1DConditional model.
-        Args:
-            x (torch.Tensor): shape (batch_size, in_channels, time)
-            mask (torch.Tensor): shape (batch_size, 1, time)
-            mu (torch.Tensor): packed with x along the channel axis; presumably (batch_size, channels, time) -- confirm against caller
-            t (torch.Tensor): shape (batch_size)
-            spks (torch.Tensor, optional): shape: (batch_size, condition_channels). Defaults to None.
-            cond (optional): placeholder for future use; not read in this method. Defaults to None.
-        Returns:
-            torch.Tensor: output of the final projection multiplied by mask.
-        """
-        t = self.time_embeddings(t)
-        t = self.time_mlp(t)
-        # concatenate x and mu along the channel axis
-        x = pack([x, mu], "b * t")[0]
-        if spks is not None:
-            # broadcast the speaker vector over time and append it as extra channels
-            spks = repeat(spks, "b c -> b c t", t=x.shape[-1])
-            x = pack([x, spks], "b * t")[0]
-        hiddens = []
-        masks = [mask]
-        for resnet, transformer_blocks, downsample in self.down_blocks:
-            mask_down = masks[-1]
-            x = resnet(x, mask_down, t)
-            # attention blocks take (batch, time, channels) and a (batch, time) mask
-            x = rearrange(x, "b c t -> b t c")
-            mask_down = rearrange(mask_down, "b 1 t -> b t")
-            for transformer_block in transformer_blocks:
-                x = transformer_block(
-                    hidden_states=x,
-                    attention_mask=mask_down,
-                    timestep=t,
-                )
-            x = rearrange(x, "b t c -> b c t")
-            mask_down = rearrange(mask_down, "b t -> b 1 t")
-            hiddens.append(x)  # Save hidden states for skip connections
-            x = downsample(x * mask_down)
-            # keep every second mask position for the next stage
-            masks.append(mask_down[:, :, ::2])
-        # drop the mask appended after the last down stage
-        masks = masks[:-1]
-        mask_mid = masks[-1]
-        for resnet, transformer_blocks in self.mid_blocks:
-            x = resnet(x, mask_mid, t)
-            x = rearrange(x, "b c t -> b t c")
-            mask_mid = rearrange(mask_mid, "b 1 t -> b t")
-            for transformer_block in transformer_blocks:
-                x = transformer_block(
-                    hidden_states=x,
-                    attention_mask=mask_mid,
-                    timestep=t,
-                )
-            x = rearrange(x, "b t c -> b c t")
-            mask_mid = rearrange(mask_mid, "b t -> b 1 t")
-        for resnet, transformer_blocks, upsample in self.up_blocks:
-            # masks and skip tensors are consumed in reverse order of the down path
-            mask_up = masks.pop()
-            x = resnet(pack([x, hiddens.pop()], "b * t")[0], mask_up, t)
-            x = rearrange(x, "b c t -> b t c")
-            mask_up = rearrange(mask_up, "b 1 t -> b t")
-            for transformer_block in transformer_blocks:
-                x = transformer_block(
-                    hidden_states=x,
-                    attention_mask=mask_up,
-                    timestep=t,
-                )
-            x = rearrange(x, "b t c -> b c t")
-            mask_up = rearrange(mask_up, "b t -> b 1 t")
-            x = upsample(x * mask_up)
-        # mask_up here is the mask of the last up stage
-        x = self.final_block(x, mask_up)
-        output = self.final_proj(x * mask_up)
-        return output * mask

cosyvoice/matcha/flow_matching.py DELETED Viewed

@@ -1,141 +0,0 @@
-from abc import ABC
-import torch
-import torch.nn.functional as F
-from cosyvoice.matcha.decoder import Decoder
-class BASECFM(torch.nn.Module, ABC):
-    def __init__(
-        self,
-        n_feats,
-        cfm_params,
-        n_spks=1,
-        spk_emb_dim=128,
-    ):
-        super().__init__()
-        self.n_feats = n_feats
-        self.n_spks = n_spks
-        self.spk_emb_dim = spk_emb_dim
-        self.solver = cfm_params.solver
-        if hasattr(cfm_params, "sigma_min"):
-            self.sigma_min = cfm_params.sigma_min
-        else:
-            self.sigma_min = 1e-4
-        self.estimator = None
-    @torch.inference_mode()
-    def forward(self, mu, mask, n_timesteps, temperature=1.0, spks=None, cond=None):
-        """Forward diffusion
-        Args:
-            mu (torch.Tensor): output of encoder
-                shape: (batch_size, n_feats, mel_timesteps)
-            mask (torch.Tensor): output_mask
-                shape: (batch_size, 1, mel_timesteps)
-            n_timesteps (int): number of diffusion steps
-            temperature (float, optional): temperature for scaling noise. Defaults to 1.0.
-            spks (torch.Tensor, optional): speaker ids. Defaults to None.
-                shape: (batch_size, spk_emb_dim)
-            cond: Not used but kept for future purposes
-        Returns:
-            sample: generated mel-spectrogram
-                shape: (batch_size, n_feats, mel_timesteps)
-        """
-        z = torch.randn_like(mu) * temperature
-        t_span = torch.linspace(0, 1, n_timesteps + 1, device=mu.device)
-        return self.solve_euler(
-            z, t_span=t_span, mu=mu, mask=mask, spks=spks, cond=cond
-        )
-    def solve_euler(self, x, t_span, mu, mask, spks, cond):
-        """
-        Fixed euler solver for ODEs.
-        Args:
-            x (torch.Tensor): random noise
-            t_span (torch.Tensor): n_timesteps interpolated
-                shape: (n_timesteps + 1,)
-            mu (torch.Tensor): output of encoder
-                shape: (batch_size, n_feats, mel_timesteps)
-            mask (torch.Tensor): output_mask
-                shape: (batch_size, 1, mel_timesteps)
-            spks (torch.Tensor, optional): speaker ids. Defaults to None.
-                shape: (batch_size, spk_emb_dim)
-            cond: Not used but kept for future purposes
-        """
-        t, _, dt = t_span[0], t_span[-1], t_span[1] - t_span[0]
-        # I am storing this because I can later plot it by putting a debugger here and saving it to a file
-        # Or in future might add like a return_all_steps flag
-        sol = []
-        for step in range(1, len(t_span)):
-            dphi_dt = self.estimator(x, mask, mu, t, spks, cond)
-            x = x + dt * dphi_dt
-            t = t + dt
-            sol.append(x)
-            if step < len(t_span) - 1:
-                dt = t_span[step + 1] - t
-        return sol[-1]
-    def compute_loss(self, x1, mask, mu, spks=None, cond=None):
-        """Computes diffusion loss
-        Args:
-            x1 (torch.Tensor): Target
-                shape: (batch_size, n_feats, mel_timesteps)
-            mask (torch.Tensor): target mask
-                shape: (batch_size, 1, mel_timesteps)
-            mu (torch.Tensor): output of encoder
-                shape: (batch_size, n_feats, mel_timesteps)
-            spks (torch.Tensor, optional): speaker embedding. Defaults to None.
-                shape: (batch_size, spk_emb_dim)
-        Returns:
-            loss: conditional flow matching loss
-            y: conditional flow
-                shape: (batch_size, n_feats, mel_timesteps)
-        """
-        b, _, t = mu.shape
-        # random timestep
-        t = torch.rand([b, 1, 1], device=mu.device, dtype=mu.dtype)
-        # sample noise p(x_0)
-        z = torch.randn_like(x1)
-        y = (1 - (1 - self.sigma_min) * t) * z + t * x1
-        u = x1 - (1 - self.sigma_min) * z
-        loss = F.mse_loss(
-            self.estimator(y, mask, mu, t.squeeze(), spks), u, reduction="sum"
-        ) / (torch.sum(mask) * u.shape[1])
-        return loss, y
-class CFM(BASECFM):
-    def __init__(
-        self,
-        in_channels,
-        out_channel,
-        cfm_params,
-        decoder_params,
-        n_spks=1,
-        spk_emb_dim=64,
-    ):
-        super().__init__(
-            n_feats=in_channels,
-            cfm_params=cfm_params,
-            n_spks=n_spks,
-            spk_emb_dim=spk_emb_dim,
-        )
-        in_channels = in_channels + (spk_emb_dim if n_spks > 1 else 0)
-        # Just change the architecture of the estimator here
-        self.estimator = Decoder(
-            in_channels=in_channels, out_channels=out_channel, **decoder_params
-        )

cosyvoice/matcha/transformer.py DELETED Viewed

@@ -1,443 +0,0 @@
-from typing import Any, Dict, Optional
-import torch
-import torch.nn as nn
-from diffusers.models.attention import (
-    GEGLU,
-    GELU,
-    AdaLayerNorm,
-    AdaLayerNormZero,
-    ApproximateGELU,
-)
-from diffusers.models.attention_processor import Attention
-from diffusers.models.lora import LoRACompatibleLinear
-from diffusers.utils.torch_utils import maybe_allow_in_graph
-class SnakeBeta(nn.Module):
-    """
-    A modified Snake function which uses separate parameters for the magnitude of the periodic components
-    Shape:
-        - Input: (B, C, T)
-        - Output: (B, C, T), same shape as the input
-    Parameters:
-        - alpha - trainable parameter that controls frequency
-        - beta - trainable parameter that controls magnitude
-    References:
-        - This activation function is a modified version based on this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda:
-        https://arxiv.org/abs/2006.08195
-    Examples:
-        >>> a1 = snakebeta(256)
-        >>> x = torch.randn(256)
-        >>> x = a1(x)
-    """
-    def __init__(
-        self,
-        in_features,
-        out_features,
-        alpha=1.0,
-        alpha_trainable=True,
-        alpha_logscale=True,
-    ):
-        """
-        Initialization.
-        INPUT:
-            - in_features: shape of the input
-            - alpha - trainable parameter that controls frequency
-            - beta - trainable parameter that controls magnitude
-            alpha is initialized to 1 by default, higher values = higher-frequency.
-            beta is initialized to 1 by default, higher values = higher-magnitude.
-            alpha will be trained along with the rest of your model.
-        """
-        super().__init__()
-        self.in_features = (
-            out_features if isinstance(out_features, list) else [out_features]
-        )
-        self.proj = LoRACompatibleLinear(in_features, out_features)
-        # initialize alpha
-        self.alpha_logscale = alpha_logscale
-        if self.alpha_logscale:  # log scale alphas initialized to zeros
-            self.alpha = nn.Parameter(torch.zeros(self.in_features) * alpha)
-            self.beta = nn.Parameter(torch.zeros(self.in_features) * alpha)
-        else:  # linear scale alphas initialized to ones
-            self.alpha = nn.Parameter(torch.ones(self.in_features) * alpha)
-            self.beta = nn.Parameter(torch.ones(self.in_features) * alpha)
-        self.alpha.requires_grad = alpha_trainable
-        self.beta.requires_grad = alpha_trainable
-        self.no_div_by_zero = 0.000000001
-    def forward(self, x):
-        """
-        Forward pass of the function.
-        Applies the function to the input elementwise.
-        SnakeBeta ∶= x + 1/b * sin^2 (xa)
-        """
-        x = self.proj(x)
-        if self.alpha_logscale:
-            alpha = torch.exp(self.alpha)
-            beta = torch.exp(self.beta)
-        else:
-            alpha = self.alpha
-            beta = self.beta
-        x = x + (1.0 / (beta + self.no_div_by_zero)) * torch.pow(
-            torch.sin(x * alpha), 2
-        )
-        return x
-class FeedForward(nn.Module):
-    r"""
-    A feed-forward layer.
-    Parameters:
-        dim (`int`): The number of channels in the input.
-        dim_out (`int`, *optional*): The number of channels in the output. If not given, defaults to `dim`.
-        mult (`int`, *optional*, defaults to 4): The multiplier to use for the hidden dimension.
-        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
-        activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
-        final_dropout (`bool` *optional*, defaults to False): Apply a final dropout.
-    """
-    def __init__(
-        self,
-        dim: int,
-        dim_out: Optional[int] = None,
-        mult: int = 4,
-        dropout: float = 0.0,
-        activation_fn: str = "geglu",
-        final_dropout: bool = False,
-    ):
-        super().__init__()
-        inner_dim = int(dim * mult)
-        dim_out = dim_out if dim_out is not None else dim
-        if activation_fn == "gelu":
-            act_fn = GELU(dim, inner_dim)
-        if activation_fn == "gelu-approximate":
-            act_fn = GELU(dim, inner_dim, approximate="tanh")
-        elif activation_fn == "geglu":
-            act_fn = GEGLU(dim, inner_dim)
-        elif activation_fn == "geglu-approximate":
-            act_fn = ApproximateGELU(dim, inner_dim)
-        elif activation_fn == "snakebeta":
-            act_fn = SnakeBeta(dim, inner_dim)
-        self.net = nn.ModuleList([])
-        # project in
-        self.net.append(act_fn)
-        # project dropout
-        self.net.append(nn.Dropout(dropout))
-        # project out
-        self.net.append(LoRACompatibleLinear(inner_dim, dim_out))
-        # FF as used in Vision Transformer, MLP-Mixer, etc. have a final dropout
-        if final_dropout:
-            self.net.append(nn.Dropout(dropout))
-    def forward(self, hidden_states):
-        for module in self.net:
-            hidden_states = module(hidden_states)
-        return hidden_states
-@maybe_allow_in_graph
-class BasicTransformerBlock(nn.Module):
-    r"""
-    A basic Transformer block.
-    Parameters:
-        dim (`int`): The number of channels in the input and output.
-        num_attention_heads (`int`): The number of heads to use for multi-head attention.
-        attention_head_dim (`int`): The number of channels in each head.
-        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
-        cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention.
-        only_cross_attention (`bool`, *optional*):
-            Whether to use only cross-attention layers. In this case two cross attention layers are used.
-        double_self_attention (`bool`, *optional*):
-            Whether to use two self-attention layers. In this case no cross attention layers are used.
-        activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
-        num_embeds_ada_norm (:
-            obj: `int`, *optional*): The number of diffusion steps used during training. See `Transformer2DModel`.
-        attention_bias (:
-            obj: `bool`, *optional*, defaults to `False`): Configure if the attentions should contain a bias parameter.
-    """
-    def __init__(
-        self,
-        dim: int,
-        num_attention_heads: int,
-        attention_head_dim: int,
-        dropout=0.0,
-        cross_attention_dim: Optional[int] = None,
-        activation_fn: str = "geglu",
-        num_embeds_ada_norm: Optional[int] = None,
-        attention_bias: bool = False,
-        only_cross_attention: bool = False,
-        double_self_attention: bool = False,
-        upcast_attention: bool = False,
-        norm_elementwise_affine: bool = True,
-        norm_type: str = "layer_norm",
-        final_dropout: bool = False,
-    ):
-        super().__init__()
-        self.only_cross_attention = only_cross_attention
-        self.use_ada_layer_norm_zero = (
-            num_embeds_ada_norm is not None
-        ) and norm_type == "ada_norm_zero"
-        self.use_ada_layer_norm = (
-            num_embeds_ada_norm is not None
-        ) and norm_type == "ada_norm"
-        if norm_type in ("ada_norm", "ada_norm_zero") and num_embeds_ada_norm is None:
-            raise ValueError(
-                f"`norm_type` is set to {norm_type}, but `num_embeds_ada_norm` is not defined. Please make sure to"
-                f" define `num_embeds_ada_norm` if setting `norm_type` to {norm_type}."
-            )
-        # Define 3 blocks. Each block has its own normalization layer.
-        # 1. Self-Attn
-        if self.use_ada_layer_norm:
-            self.norm1 = AdaLayerNorm(dim, num_embeds_ada_norm)
-        elif self.use_ada_layer_norm_zero:
-            self.norm1 = AdaLayerNormZero(dim, num_embeds_ada_norm)
-        else:
-            self.norm1 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
-        self.attn1 = Attention(
-            query_dim=dim,
-            heads=num_attention_heads,
-            dim_head=attention_head_dim,
-            dropout=dropout,
-            bias=attention_bias,
-            cross_attention_dim=cross_attention_dim if only_cross_attention else None,
-            upcast_attention=upcast_attention,
-        )
-        # 2. Cross-Attn
-        if cross_attention_dim is not None or double_self_attention:
-            # We currently only use AdaLayerNormZero for self attention where there will only be one attention block.
-            # I.e. the number of returned modulation chunks from AdaLayerZero would not make sense if returned during
-            # the second cross attention block.
-            self.norm2 = (
-                AdaLayerNorm(dim, num_embeds_ada_norm)
-                if self.use_ada_layer_norm
-                else nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
-            )
-            self.attn2 = Attention(
-                query_dim=dim,
-                cross_attention_dim=(
-                    cross_attention_dim if not double_self_attention else None
-                ),
-                heads=num_attention_heads,
-                dim_head=attention_head_dim,
-                dropout=dropout,
-                bias=attention_bias,
-                upcast_attention=upcast_attention,
-                # scale_qk=False, # uncomment this to not to use flash attention
-            )  # is self-attn if encoder_hidden_states is none
-        else:
-            self.norm2 = None
-            self.attn2 = None
-        # 3. Feed-forward
-        self.norm3 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
-        self.ff = FeedForward(
-            dim,
-            dropout=dropout,
-            activation_fn=activation_fn,
-            final_dropout=final_dropout,
-        )
-        # let chunk size default to None
-        self._chunk_size = None
-        self._chunk_dim = 0
-    def set_chunk_feed_forward(self, chunk_size: Optional[int], dim: int):
-        # Sets chunk feed-forward
-        self._chunk_size = chunk_size
-        self._chunk_dim = dim
-    def forward_native(
-        self,
-        hidden_states: torch.FloatTensor,
-        attention_mask: Optional[torch.FloatTensor] = None,
-        encoder_hidden_states: Optional[torch.FloatTensor] = None,
-        encoder_attention_mask: Optional[torch.FloatTensor] = None,
-        timestep: Optional[torch.LongTensor] = None,
-        cross_attention_kwargs: Dict[str, Any] = None,
-        class_labels: Optional[torch.LongTensor] = None,
-    ):
-        # Notice that normalization is always applied before the real computation in the following blocks.
-        # 1. Self-Attention
-        if self.use_ada_layer_norm:
-            norm_hidden_states = self.norm1(hidden_states, timestep)
-        elif self.use_ada_layer_norm_zero:
-            norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(
-                hidden_states, timestep, class_labels, hidden_dtype=hidden_states.dtype
-            )
-        else:
-            norm_hidden_states = self.norm1(hidden_states)
-        cross_attention_kwargs = (
-            cross_attention_kwargs if cross_attention_kwargs is not None else {}
-        )
-        attn_output = self.attn1(
-            norm_hidden_states,
-            encoder_hidden_states=(
-                encoder_hidden_states if self.only_cross_attention else None
-            ),
-            attention_mask=(
-                encoder_attention_mask if self.only_cross_attention else attention_mask
-            ),
-            **cross_attention_kwargs,
-        )
-        if self.use_ada_layer_norm_zero:
-            attn_output = gate_msa.unsqueeze(1) * attn_output
-        hidden_states = attn_output + hidden_states
-        # 2. Cross-Attention
-        if self.attn2 is not None:
-            norm_hidden_states = (
-                self.norm2(hidden_states, timestep)
-                if self.use_ada_layer_norm
-                else self.norm2(hidden_states)
-            )
-            attn_output = self.attn2(
-                norm_hidden_states,
-                encoder_hidden_states=encoder_hidden_states,
-                attention_mask=encoder_attention_mask,
-                **cross_attention_kwargs,
-            )
-            hidden_states = attn_output + hidden_states
-        # 3. Feed-forward
-        norm_hidden_states = self.norm3(hidden_states)
-        if self.use_ada_layer_norm_zero:
-            norm_hidden_states = (
-                norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
-            )
-        if self._chunk_size is not None:
-            # "feed_forward_chunk_size" can be used to save memory
-            if norm_hidden_states.shape[self._chunk_dim] % self._chunk_size != 0:
-                raise ValueError(
-                    f"`hidden_states` dimension to be chunked: {norm_hidden_states.shape[self._chunk_dim]} has to be divisible by chunk size: {self._chunk_size}. Make sure to set an appropriate `chunk_size` when calling `unet.enable_forward_chunking`."
-                )
-            num_chunks = norm_hidden_states.shape[self._chunk_dim] // self._chunk_size
-            ff_output = torch.cat(
-                [
-                    self.ff(hid_slice)
-                    for hid_slice in norm_hidden_states.chunk(
-                        num_chunks, dim=self._chunk_dim
-                    )
-                ],
-                dim=self._chunk_dim,
-            )
-        else:
-            ff_output = self.ff(norm_hidden_states)
-        if self.use_ada_layer_norm_zero:
-            ff_output = gate_mlp.unsqueeze(1) * ff_output
-        hidden_states = ff_output + hidden_states
-        return hidden_states
-    def forward(
-        self,
-        hidden_states: torch.FloatTensor,
-        attention_mask: Optional[torch.FloatTensor] = None,
-        encoder_hidden_states: Optional[torch.FloatTensor] = None,
-        encoder_attention_mask: Optional[torch.FloatTensor] = None,
-        timestep: Optional[torch.LongTensor] = None,
-        cross_attention_kwargs: Dict[str, Any] = None,
-        class_labels: Optional[torch.LongTensor] = None,
-    ):
-        # Notice that normalization is always applied before the real computation in the following blocks.
-        # 1. Self-Attention
-        if self.use_ada_layer_norm:
-            norm_hidden_states = self.norm1(hidden_states, timestep)
-        elif self.use_ada_layer_norm_zero:
-            norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(
-                hidden_states, timestep, class_labels, hidden_dtype=hidden_states.dtype
-            )
-        else:
-            norm_hidden_states = self.norm1(hidden_states)
-        cross_attention_kwargs = (
-            cross_attention_kwargs if cross_attention_kwargs is not None else {}
-        )
-        attn_output = self.attn1(
-            norm_hidden_states,
-            encoder_hidden_states=(
-                encoder_hidden_states if self.only_cross_attention else None
-            ),
-            attention_mask=(
-                encoder_attention_mask if self.only_cross_attention else attention_mask
-            ),
-            **cross_attention_kwargs,
-        )
-        if self.use_ada_layer_norm_zero:
-            attn_output = gate_msa.unsqueeze(1) * attn_output
-        hidden_states = attn_output + hidden_states
-        # 2. Cross-Attention
-        if self.attn2 is not None:
-            norm_hidden_states = (
-                self.norm2(hidden_states, timestep)
-                if self.use_ada_layer_norm
-                else self.norm2(hidden_states)
-            )
-            attn_output = self.attn2(
-                norm_hidden_states,
-                encoder_hidden_states=encoder_hidden_states,
-                attention_mask=encoder_attention_mask,
-                **cross_attention_kwargs,
-            )
-            hidden_states = attn_output + hidden_states
-        # 3. Feed-forward
-        norm_hidden_states = self.norm3(hidden_states)
-        if self.use_ada_layer_norm_zero:
-            norm_hidden_states = (
-                norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
-            )
-        if self._chunk_size is not None:
-            # "feed_forward_chunk_size" can be used to save memory
-            if norm_hidden_states.shape[self._chunk_dim] % self._chunk_size != 0:
-                raise ValueError(
-                    f"`hidden_states` dimension to be chunked: {norm_hidden_states.shape[self._chunk_dim]} has to be divisible by chunk size: {self._chunk_size}. Make sure to set an appropriate `chunk_size` when calling `unet.enable_forward_chunking`."
-                )
-            num_chunks = norm_hidden_states.shape[self._chunk_dim] // self._chunk_size
-            ff_output = torch.cat(
-                [
-                    self.ff(hid_slice)
-                    for hid_slice in norm_hidden_states.chunk(
-                        num_chunks, dim=self._chunk_dim
-                    )
-                ],
-                dim=self._chunk_dim,
-            )
-        else:
-            ff_output = self.ff(norm_hidden_states)
-        if self.use_ada_layer_norm_zero:
-            ff_output = gate_mlp.unsqueeze(1) * ff_output
-        hidden_states = ff_output + hidden_states
-        return hidden_states

cosyvoice/transformer/__init__.py DELETED Viewed

File without changes

cosyvoice/transformer/activation.py DELETED Viewed

@@ -1,87 +0,0 @@
-# Copyright (c) 2020 Johns Hopkins University (Shinji Watanabe)
-#               2020 Northwestern Polytechnical University (Pengcheng Guo)
-#               2020 Mobvoi Inc (Binbin Zhang)
-#               2024 Alibaba Inc (Xiang Lyu)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Swish() activation function for Conformer."""
-import torch
-from torch import nn, sin, pow
-from torch.nn import Parameter
-class Swish(torch.nn.Module):
-    """Construct an Swish object."""
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        """Return Swish activation function."""
-        return x * torch.sigmoid(x)
-# Implementation adapted from https://github.com/EdwardDixon/snake under the MIT license.
-#   LICENSE is in incl_licenses directory.
-class Snake(nn.Module):
-    """
-    Implementation of a sine-based periodic activation function
-    Shape:
-        - Input: (B, C, T)
-        - Output: (B, C, T), same shape as the input
-    Parameters:
-        - alpha - trainable parameter
-    References:
-        - This activation function is from this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda:
-        https://arxiv.org/abs/2006.08195
-    Examples:
-        >>> a1 = snake(256)
-        >>> x = torch.randn(256)
-        >>> x = a1(x)
-    """
-    def __init__(
-        self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False
-    ):
-        """
-        Initialization.
-        INPUT:
-            - in_features: shape of the input
-            - alpha: trainable parameter
-            alpha is initialized to 1 by default, higher values = higher-frequency.
-            alpha will be trained along with the rest of your model.
-        """
-        super(Snake, self).__init__()
-        self.in_features = in_features
-        # initialize alpha
-        self.alpha_logscale = alpha_logscale
-        if self.alpha_logscale:  # log scale alphas initialized to zeros
-            self.alpha = Parameter(torch.zeros(in_features) * alpha)
-        else:  # linear scale alphas initialized to ones
-            self.alpha = Parameter(torch.ones(in_features) * alpha)
-        self.alpha.requires_grad = alpha_trainable
-        self.no_div_by_zero = 0.000000001
-    def forward(self, x):
-        """
-        Forward pass of the function.
-        Applies the function to the input elementwise.
-        Snake ∶= x + 1/a * sin^2 (xa)
-        """
-        alpha = self.alpha.unsqueeze(0).unsqueeze(-1)  # line up with x to [B, C, T]
-        if self.alpha_logscale:
-            alpha = torch.exp(alpha)
-        x = x + (1.0 / (alpha + self.no_div_by_zero)) * pow(sin(x * alpha), 2)
-        return x

cosyvoice/transformer/attention.py DELETED Viewed

@@ -1,322 +0,0 @@
-# Copyright (c) 2019 Shigeki Karita
-#               2020 Mobvoi Inc (Binbin Zhang)
-#               2022 Xingchen Song ([email protected])
-#               2024 Alibaba Inc (Xiang Lyu)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Multi-Head Attention layer definition."""
-import math
-from typing import Tuple
-import torch
-from torch import nn
-class MultiHeadedAttention(nn.Module):
-    """Multi-Head Attention layer.
-    Projects query/key/value with separate linear layers, splits the feature
-    axis into ``n_head`` heads of size ``n_feat // n_head``, applies scaled
-    dot-product attention and merges the heads with an output projection.
-    Args:
-        n_head (int): The number of heads.
-        n_feat (int): The number of features; must be divisible by n_head.
-        dropout_rate (float): Dropout rate applied to the attention weights.
-        key_bias (bool): Whether the key projection (linear_k) has a bias.
-    """
-    def __init__(
-        self, n_head: int, n_feat: int, dropout_rate: float, key_bias: bool = True
-    ):
-        """Construct a MultiHeadedAttention object."""
-        super().__init__()
-        assert n_feat % n_head == 0
-        # We assume d_v always equals d_k
-        self.d_k = n_feat // n_head
-        self.h = n_head
-        self.linear_q = nn.Linear(n_feat, n_feat)
-        self.linear_k = nn.Linear(n_feat, n_feat, bias=key_bias)
-        self.linear_v = nn.Linear(n_feat, n_feat)
-        self.linear_out = nn.Linear(n_feat, n_feat)
-        self.dropout = nn.Dropout(p=dropout_rate)
-    def forward_qkv(
-        self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        """Project query, key and value and split them into heads.
-        Args:
-            query (torch.Tensor): Query tensor (#batch, time1, size).
-            key (torch.Tensor): Key tensor (#batch, time2, size).
-            value (torch.Tensor): Value tensor (#batch, time2, size).
-        Returns:
-            torch.Tensor: Transformed query tensor, size
-                (#batch, n_head, time1, d_k).
-            torch.Tensor: Transformed key tensor, size
-                (#batch, n_head, time2, d_k).
-            torch.Tensor: Transformed value tensor, size
-                (#batch, n_head, time2, d_k).
-        """
-        # the batch size is taken from query and reused for key and value
-        n_batch = query.size(0)
-        q = self.linear_q(query).view(n_batch, -1, self.h, self.d_k)
-        k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k)
-        v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k)
-        q = q.transpose(1, 2)  # (batch, head, time1, d_k)
-        k = k.transpose(1, 2)  # (batch, head, time2, d_k)
-        v = v.transpose(1, 2)  # (batch, head, time2, d_k)
-        return q, k, v
-    def forward_attention(
-        self,
-        value: torch.Tensor,
-        scores: torch.Tensor,
-        mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool),
-    ) -> torch.Tensor:
-        """Compute attention context vector.
-        Args:
-            value (torch.Tensor): Transformed value, size
-                (#batch, n_head, time2, d_k).
-            scores (torch.Tensor): Attention score, size
-                (#batch, n_head, time1, time2).
-            mask (torch.Tensor): Mask, size (#batch, 1, time2) or
-                (#batch, time1, time2), (0, 0, 0) means fake mask.
-                Positions where the mask equals 0 are excluded from attention.
-        Returns:
-            torch.Tensor: Transformed value (#batch, time1, d_model)
-                weighted by the attention score (#batch, time1, time2).
-        """
-        n_batch = value.size(0)
-        # NOTE(xcsong): When will `if mask.size(2) > 0` be True?
-        #   1. onnx(16/4) [WHY? Because we feed real cache & real mask for the
-        #           1st chunk to ease the onnx export.]
-        #   2. pytorch training
-        if mask.size(2) > 0:  # time2 > 0
-            # after eq(0) the mask is True at the positions to be dropped
-            mask = mask.unsqueeze(1).eq(0)  # (batch, 1, *, time2)
-            # For last chunk, time2 might be larger than scores.size(-1)
-            mask = mask[:, :, :, : scores.size(-1)]  # (batch, 1, *, time2)
-            scores = scores.masked_fill(mask, -float("inf"))
-            # zero the dropped positions again after softmax; this also
-            # overwrites the NaNs softmax yields for rows masked everywhere
-            attn = torch.softmax(scores, dim=-1).masked_fill(
-                mask, 0.0
-            )  # (batch, head, time1, time2)
-        # NOTE(xcsong): When will `if mask.size(2) > 0` be False?
-        #   1. onnx(16/-1, -1/-1, 16/0)
-        #   2. jit (16/-1, -1/-1, 16/0, 16/4)
-        else:
-            attn = torch.softmax(scores, dim=-1)  # (batch, head, time1, time2)
-        p_attn = self.dropout(attn)
-        x = torch.matmul(p_attn, value)  # (batch, head, time1, d_k)
-        # merge the heads back into a single feature axis
-        x = (
-            x.transpose(1, 2).contiguous().view(n_batch, -1, self.h * self.d_k)
-        )  # (batch, time1, d_model)
-        return self.linear_out(x)  # (batch, time1, d_model)
-    def forward(
-        self,
-        query: torch.Tensor,
-        key: torch.Tensor,
-        value: torch.Tensor,
-        mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool),
-        pos_emb: torch.Tensor = torch.empty(0),
-        cache: torch.Tensor = torch.zeros((0, 0, 0, 0)),
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        """Compute scaled dot product attention.
-        Args:
-            query (torch.Tensor): Query tensor (#batch, time1, size).
-            key (torch.Tensor): Key tensor (#batch, time2, size).
-            value (torch.Tensor): Value tensor (#batch, time2, size).
-            mask (torch.Tensor): Mask tensor (#batch, 1, time2) or
-                (#batch, time1, time2).
-                1.When applying cross attention between decoder and encoder,
-                the batch padding mask for input is in (#batch, 1, T) shape.
-                2.When applying self attention of encoder,
-                the mask is in (#batch, T, T)  shape.
-                3.When applying self attention of decoder,
-                the mask is in (#batch, L, L)  shape.
-                4.If the different position in decoder see different block
-                of the encoder, such as Mocha, the passed in mask could be
-                in (#batch, L, T) shape. But there is no such case in current
-                CosyVoice.
-            pos_emb (torch.Tensor): Not read by this method; the parameter
-                matches the signature of RelPositionMultiHeadedAttention.forward.
-            cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2),
-                where `cache_t == chunk_size * num_decoding_left_chunks`
-                and `head * d_k == size`
-        Returns:
-            torch.Tensor: Output tensor (#batch, time1, d_model).
-            torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2)
-                where `cache_t == chunk_size * num_decoding_left_chunks`
-                and `head * d_k == size`
-        """
-        q, k, v = self.forward_qkv(query, key, value)
-        # NOTE(xcsong):
-        #   when exporting the onnx model, for the 1st chunk, we feed
-        #       cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode)
-        #       or cache(1, head, real_cache_t, d_k * 2) (16/4 mode).
-        #       In all modes, `if cache.size(0) > 0` will always be `True`
-        #       and we will always do splitting and
-        #       concatenation (this will simplify onnx export). Note that
-        #       it's OK to concat & split zero-shaped tensors (see code below).
-        #   when exporting the jit model, for the 1st chunk, we always feed
-        #       cache(0, 0, 0, 0) since jit supports dynamic if-branch.
-        # >>> a = torch.ones((1, 2, 0, 4))
-        # >>> b = torch.ones((1, 2, 3, 4))
-        # >>> c = torch.cat((a, b), dim=2)
-        # >>> torch.equal(b, c)        # True
-        # >>> d = torch.split(a, 2, dim=-1)
-        # >>> torch.equal(d[0], d[1])  # True
-        if cache.size(0) > 0:
-            # the cache stores keys and values side by side on the last axis;
-            # the cached frames are prepended along the time axis (dim=2)
-            key_cache, value_cache = torch.split(cache, cache.size(-1) // 2, dim=-1)
-            k = torch.cat([key_cache, k], dim=2)
-            v = torch.cat([value_cache, v], dim=2)
-        # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's
-        #   non-trivial to calculate `next_cache_start` here.
-        new_cache = torch.cat((k, v), dim=-1)
-        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k)
-        return self.forward_attention(v, scores, mask), new_cache
-class RelPositionMultiHeadedAttention(MultiHeadedAttention):
-    """Multi-Head Attention layer with relative position encoding.
-    Adds a position-dependent score term (computed from ``pos_emb``) and two
-    learnable per-head biases to the content-based attention score.
-    Paper: https://arxiv.org/abs/1901.02860
-    Args:
-        n_head (int): The number of heads.
-        n_feat (int): The number of features.
-        dropout_rate (float): Dropout rate.
-        key_bias (bool): Whether the key projection (linear_k) has a bias.
-    """
-    def __init__(
-        self, n_head: int, n_feat: int, dropout_rate: float, key_bias: bool = True
-    ):
-        """Construct a RelPositionMultiHeadedAttention object."""
-        super().__init__(n_head, n_feat, dropout_rate, key_bias)
-        # linear transformation for positional encoding
-        self.linear_pos = nn.Linear(n_feat, n_feat, bias=False)
-        # these two learnable biases are used in matrix c and matrix d
-        # as described in https://arxiv.org/abs/1901.02860 Section 3.3
-        # torch.Tensor(h, d_k) only allocates storage; the values are set by
-        # the xavier_uniform_ calls below
-        self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k))
-        self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k))
-        torch.nn.init.xavier_uniform_(self.pos_bias_u)
-        torch.nn.init.xavier_uniform_(self.pos_bias_v)
-    def rel_shift(self, x: torch.Tensor) -> torch.Tensor:
-        """Compute relative positional encoding.
-        Prepends a zero column, reinterprets the padded tensor with its last
-        two axes swapped in size, drops the first row, views the result back
-        in the shape of ``x`` and keeps the first ``x.size(-1) // 2 + 1``
-        columns.
-        Args:
-            x (torch.Tensor): Input tensor (batch, head, time1, 2*time1-1).
-            time1 means the length of query vector.
-        Returns:
-            torch.Tensor: Output tensor (batch, head, time1, x.size(-1) // 2 + 1).
-        """
-        zero_pad = torch.zeros(
-            (x.size()[0], x.size()[1], x.size()[2], 1), device=x.device, dtype=x.dtype
-        )
-        x_padded = torch.cat([zero_pad, x], dim=-1)
-        x_padded = x_padded.view(x.size()[0], x.size()[1], x.size(3) + 1, x.size(2))
-        x = x_padded[:, :, 1:].view_as(x)[
-            :, :, :, : x.size(-1) // 2 + 1
-        ]  # only keep the positions from 0 to time2
-        return x
-    def forward(
-        self,
-        query: torch.Tensor,
-        key: torch.Tensor,
-        value: torch.Tensor,
-        mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool),
-        pos_emb: torch.Tensor = torch.empty(0),
-        cache: torch.Tensor = torch.zeros((0, 0, 0, 0)),
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        """Compute 'Scaled Dot Product Attention' with rel. positional encoding.
-        Args:
-            query (torch.Tensor): Query tensor (#batch, time1, size).
-            key (torch.Tensor): Key tensor (#batch, time2, size).
-            value (torch.Tensor): Value tensor (#batch, time2, size).
-            mask (torch.Tensor): Mask tensor (#batch, 1, time2) or
-                (#batch, time1, time2), (0, 0, 0) means fake mask.
-            pos_emb (torch.Tensor): Positional embedding tensor
-                (#batch, time2, size).
-            cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2),
-                where `cache_t == chunk_size * num_decoding_left_chunks`
-                and `head * d_k == size`
-        Returns:
-            torch.Tensor: Output tensor (#batch, time1, d_model).
-            torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2)
-                where `cache_t == chunk_size * num_decoding_left_chunks`
-                and `head * d_k == size`
-        """
-        q, k, v = self.forward_qkv(query, key, value)
-        # move heads next to d_k so the (head, d_k) biases broadcast onto q below
-        q = q.transpose(1, 2)  # (batch, time1, head, d_k)
-        # NOTE(xcsong):
-        #   when exporting the onnx model, for the 1st chunk, we feed
-        #       cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode)
-        #       or cache(1, head, real_cache_t, d_k * 2) (16/4 mode).
-        #       In all modes, `if cache.size(0) > 0` will always be `True`
-        #       and we will always do splitting and
-        #       concatenation (this will simplify onnx export). Note that
-        #       it's OK to concat & split zero-shaped tensors (see code below).
-        #   when exporting the jit model, for the 1st chunk, we always feed
-        #       cache(0, 0, 0, 0) since jit supports dynamic if-branch.
-        # >>> a = torch.ones((1, 2, 0, 4))
-        # >>> b = torch.ones((1, 2, 3, 4))
-        # >>> c = torch.cat((a, b), dim=2)
-        # >>> torch.equal(b, c)        # True
-        # >>> d = torch.split(a, 2, dim=-1)
-        # >>> torch.equal(d[0], d[1])  # True
-        if cache.size(0) > 0:
-            key_cache, value_cache = torch.split(cache, cache.size(-1) // 2, dim=-1)
-            k = torch.cat([key_cache, k], dim=2)
-            v = torch.cat([value_cache, v], dim=2)
-        # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's
-        #   non-trivial to calculate `next_cache_start` here.
-        new_cache = torch.cat((k, v), dim=-1)
-        # pos_emb carries its own batch size, which may differ from the query's
-        n_batch_pos = pos_emb.size(0)
-        p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k)
-        p = p.transpose(1, 2)  # (batch, head, time1, d_k)
-        # (batch, head, time1, d_k)
-        q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2)
-        # (batch, head, time1, d_k)
-        q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2)
-        # compute attention score
-        # first compute matrix a and matrix c
-        # as described in https://arxiv.org/abs/1901.02860 Section 3.3
-        # (batch, head, time1, time2)
-        matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1))
-        # compute matrix b and matrix d
-        # (batch, head, time1, number of positions in pos_emb)
-        matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1))
-        # NOTE(Xiang Lyu): Keep rel_shift since espnet rel_pos_emb is used
-        # the shift is applied only when the positional term does not already
-        # have the same shape as the content term
-        if matrix_ac.shape != matrix_bd.shape:
-            matrix_bd = self.rel_shift(matrix_bd)
-        scores = (matrix_ac + matrix_bd) / math.sqrt(
-            self.d_k
-        )  # (batch, head, time1, time2)
-        return self.forward_attention(v, scores, mask), new_cache

cosyvoice/transformer/convolution.py DELETED Viewed

@@ -1,147 +0,0 @@
-# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu)
-#               2024 Alibaba Inc (Xiang Lyu)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# Modified from ESPnet(https://github.com/espnet/espnet)
-"""ConvolutionModule definition."""
-from typing import Tuple
-import torch
-from torch import nn
-class ConvolutionModule(nn.Module):
-    """ConvolutionModule in Conformer model.
-    Pipeline: pointwise conv (doubling the channels) -> GLU -> depthwise
-    conv -> norm -> activation -> pointwise conv.
-    """
-    def __init__(
-        self,
-        channels: int,
-        kernel_size: int = 15,
-        activation: nn.Module = nn.ReLU(),
-        norm: str = "batch_norm",
-        causal: bool = False,
-        bias: bool = True,
-    ):
-        """Construct a ConvolutionModule object.
-        Args:
-            channels (int): The number of channels of conv layers.
-            kernel_size (int): Kernel size of the depthwise conv layer; must
-                be odd when ``causal`` is False.
-            activation (nn.Module): Activation applied after the norm layer.
-                The default instance is created once at definition time and
-                is therefore shared by every module built with the default.
-            norm (str): "batch_norm" or "layer_norm".
-            causal (bool): Whether to use causal convolution or not.
-            bias (bool): Whether the three conv layers have a bias term.
-        """
-        super().__init__()
-        self.pointwise_conv1 = nn.Conv1d(
-            channels,
-            2 * channels,
-            kernel_size=1,
-            stride=1,
-            padding=0,
-            bias=bias,
-        )
-        # self.lorder is used to distinguish if it's a causal convolution,
-        # if self.lorder > 0: it's a causal convolution, the input will be
-        #    padded with self.lorder frames on the left in forward.
-        # else: it's a symmetrical convolution
-        if causal:
-            padding = 0
-            self.lorder = kernel_size - 1
-        else:
-            # kernel_size should be an odd number for non-causal convolution
-            assert (kernel_size - 1) % 2 == 0
-            padding = (kernel_size - 1) // 2
-            self.lorder = 0
-        # groups=channels makes this a depthwise convolution
-        self.depthwise_conv = nn.Conv1d(
-            channels,
-            channels,
-            kernel_size,
-            stride=1,
-            padding=padding,
-            groups=channels,
-            bias=bias,
-        )
-        assert norm in ["batch_norm", "layer_norm"]
-        if norm == "batch_norm":
-            self.use_layer_norm = False
-            self.norm = nn.BatchNorm1d(channels)
-        else:
-            self.use_layer_norm = True
-            self.norm = nn.LayerNorm(channels)
-        self.pointwise_conv2 = nn.Conv1d(
-            channels,
-            channels,
-            kernel_size=1,
-            stride=1,
-            padding=0,
-            bias=bias,
-        )
-        self.activation = activation
-    def forward(
-        self,
-        x: torch.Tensor,
-        mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool),
-        cache: torch.Tensor = torch.zeros((0, 0, 0)),
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        """Compute convolution module.
-        Args:
-            x (torch.Tensor): Input tensor (#batch, time, channels).
-            mask_pad (torch.Tensor): used for batch padding (#batch, 1, time),
-                (0, 0, 0) means fake mask.
-            cache (torch.Tensor): left context cache, it is only
-                used in causal convolution (#batch, channels, cache_t),
-                (0, 0, 0) means fake cache.
-        Returns:
-            torch.Tensor: Output tensor (#batch, time, channels).
-            torch.Tensor: New cache. For causal convolution, the last
-                ``self.lorder`` frames of the left-extended input
-                (#batch, channels, lorder); otherwise a fake (0, 0, 0) tensor.
-        """
-        # exchange the temporal dimension and the feature dimension
-        x = x.transpose(1, 2)  # (#batch, channels, time)
-        # mask batch padding
-        # NOTE: x is a transposed view of the caller's tensor here, so this
-        # in-place fill also zeroes the padded frames of the tensor passed in.
-        if mask_pad.size(2) > 0:  # time > 0
-            x.masked_fill_(~mask_pad, 0.0)
-        if self.lorder > 0:
-            if cache.size(2) == 0:  # cache_t == 0
-                # no left context available: pad lorder zero frames on the left
-                x = nn.functional.pad(x, (self.lorder, 0), "constant", 0.0)
-            else:
-                assert cache.size(0) == x.size(0)  # equal batch
-                assert cache.size(1) == x.size(1)  # equal channel
-                x = torch.cat((cache, x), dim=2)
-            assert x.size(2) > self.lorder
-            # keep the trailing lorder frames as left context for the next call
-            new_cache = x[:, :, -self.lorder :]
-        else:
-            # It's better we just return None if no cache is required,
-            # However, for JIT export, here we just fake one tensor instead of
-            # None.
-            new_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device)
-        # GLU mechanism
-        x = self.pointwise_conv1(x)  # (batch, 2*channel, time)
-        x = nn.functional.glu(x, dim=1)  # (batch, channel, time)
-        # 1D Depthwise Conv
-        x = self.depthwise_conv(x)
-        # LayerNorm normalizes the last axis, so channels are moved there and back
-        if self.use_layer_norm:
-            x = x.transpose(1, 2)
-        x = self.activation(self.norm(x))
-        if self.use_layer_norm:
-            x = x.transpose(1, 2)
-        x = self.pointwise_conv2(x)
-        # mask batch padding
-        if mask_pad.size(2) > 0:  # time > 0
-            x.masked_fill_(~mask_pad, 0.0)
-        return x.transpose(1, 2), new_cache

cosyvoice/transformer/decoder.py DELETED Viewed

@@ -1,418 +0,0 @@
-# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang, Di Wu)
-#               2024 Alibaba Inc (Xiang Lyu)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# Modified from ESPnet(https://github.com/espnet/espnet)
-"""Decoder definition."""
-from typing import Tuple, List, Optional
-import torch
-import torch.utils.checkpoint as ckpt
-import logging
-from cosyvoice.transformer.decoder_layer import DecoderLayer
-from cosyvoice.transformer.positionwise_feed_forward import (
-    PositionwiseFeedForward,
-)
-from cosyvoice.utils.class_utils import (
-    COSYVOICE_EMB_CLASSES,
-    COSYVOICE_ATTENTION_CLASSES,
-    COSYVOICE_ACTIVATION_CLASSES,
-)
-from cosyvoice.utils.mask import subsequent_mask, make_pad_mask
-class TransformerDecoder(torch.nn.Module):
-    """Base class of Transformer decoder module.
-    Args:
-        vocab_size: output dim
-        encoder_output_size: dimension of attention
-        attention_heads: the number of heads of multi head attention
-        linear_units: the hidden units number of position-wise feedforward
-        num_blocks: the number of decoder blocks
-        dropout_rate: dropout rate
-        positional_dropout_rate: dropout rate passed to the positional
-            embedding class selected by ``input_layer``
-        self_attention_dropout_rate: dropout rate for self attention
-        src_attention_dropout_rate: dropout rate for the encoder-decoder
-            (source) attention
-        input_layer: input layer type, used as key into COSYVOICE_EMB_CLASSES;
-            "no_pos" replaces the token embedding with an identity
-        use_output_layer: whether to use output layer
-        normalize_before:
-            True: use layer_norm before each sub-block of a layer.
-            False: use layer_norm after each sub-block of a layer.
-        src_attention: if false, encoder-decoder cross attention is not
-                       applied, such as CIF model
-        key_bias: whether use bias in attention.linear_k, False for whisper models.
-        activation_type: key into COSYVOICE_ACTIVATION_CLASSES selecting the
-            feed-forward activation
-        gradient_checkpointing: rerunning a forward-pass segment for each
-            checkpointed segment during backward.
-        tie_word_embedding: Tie or clone module weights depending on whether we are
-            using TorchScript or not. Only stored here; this class does not
-            read it in the code shown.
-    """
-    def __init__(
-        self,
-        vocab_size: int,
-        encoder_output_size: int,
-        attention_heads: int = 4,
-        linear_units: int = 2048,
-        num_blocks: int = 6,
-        dropout_rate: float = 0.1,
-        positional_dropout_rate: float = 0.1,
-        self_attention_dropout_rate: float = 0.0,
-        src_attention_dropout_rate: float = 0.0,
-        input_layer: str = "embed",
-        use_output_layer: bool = True,
-        normalize_before: bool = True,
-        src_attention: bool = True,
-        key_bias: bool = True,
-        activation_type: str = "relu",
-        gradient_checkpointing: bool = False,
-        tie_word_embedding: bool = False,
-    ):
-        super().__init__()
-        attention_dim = encoder_output_size
-        activation = COSYVOICE_ACTIVATION_CLASSES[activation_type]()
-        # embed[0] is the token embedding (or identity), embed[1] the
-        # positional encoding; tie_or_clone_weights relies on this order
-        self.embed = torch.nn.Sequential(
-            (
-                torch.nn.Identity()
-                if input_layer == "no_pos"
-                else torch.nn.Embedding(vocab_size, attention_dim)
-            ),
-            COSYVOICE_EMB_CLASSES[input_layer](attention_dim, positional_dropout_rate),
-        )
-        self.normalize_before = normalize_before
-        self.after_norm = torch.nn.LayerNorm(attention_dim, eps=1e-5)
-        self.use_output_layer = use_output_layer
-        if use_output_layer:
-            self.output_layer = torch.nn.Linear(attention_dim, vocab_size)
-        else:
-            self.output_layer = torch.nn.Identity()
-        self.num_blocks = num_blocks
-        # each block gets its own self-attention, optional source attention
-        # and feed-forward module; the single `activation` instance is shared
-        self.decoders = torch.nn.ModuleList(
-            [
-                DecoderLayer(
-                    attention_dim,
-                    COSYVOICE_ATTENTION_CLASSES["selfattn"](
-                        attention_heads,
-                        attention_dim,
-                        self_attention_dropout_rate,
-                        key_bias,
-                    ),
-                    (
-                        COSYVOICE_ATTENTION_CLASSES["selfattn"](
-                            attention_heads,
-                            attention_dim,
-                            src_attention_dropout_rate,
-                            key_bias,
-                        )
-                        if src_attention
-                        else None
-                    ),
-                    PositionwiseFeedForward(
-                        attention_dim, linear_units, dropout_rate, activation
-                    ),
-                    dropout_rate,
-                    normalize_before,
-                )
-                for _ in range(self.num_blocks)
-            ]
-        )
-        self.gradient_checkpointing = gradient_checkpointing
-        self.tie_word_embedding = tie_word_embedding
-    def forward(
-        self,
-        memory: torch.Tensor,
-        memory_mask: torch.Tensor,
-        ys_in_pad: torch.Tensor,
-        ys_in_lens: torch.Tensor,
-        r_ys_in_pad: torch.Tensor = torch.empty(0),
-        reverse_weight: float = 0.0,
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        """Forward decoder.
-        Args:
-            memory: encoded memory, float32  (batch, maxlen_in, feat)
-            memory_mask: encoder memory mask, (batch, 1, maxlen_in)
-            ys_in_pad: padded input token ids, int64 (batch, maxlen_out)
-            ys_in_lens: input lengths of this batch (batch)
-            r_ys_in_pad: not used in transformer decoder, in order to unify api
-                with bidirectional decoder
-            reverse_weight: not used in transformer decoder, in order to unify
-                api with bidirectional decoder
-        Returns:
-            (tuple): tuple containing:
-                x: decoded token score before softmax (batch, maxlen_out,
-                    vocab_size) if use_output_layer is True,
-                torch.tensor(0.0), in order to unify api with bidirectional decoder
-                olens: sum of the target mask over dim 1
-        NOTE(xcsong):
-            We pass the `__call__` method of the modules instead of `forward` to the
-            checkpointing API because `__call__` attaches all the hooks of the module.
-            https://discuss.pytorch.org/t/any-different-between-model-input-and-model-forward-input/3690/2
-        """
-        tgt = ys_in_pad
-        maxlen = tgt.size(1)
-        # tgt_mask: (B, 1, L)
-        tgt_mask = ~make_pad_mask(ys_in_lens, maxlen).unsqueeze(1)
-        tgt_mask = tgt_mask.to(tgt.device)
-        # m: (1, L, L)
-        m = subsequent_mask(tgt_mask.size(-1), device=tgt_mask.device).unsqueeze(0)
-        # tgt_mask: (B, L, L)
-        tgt_mask = tgt_mask & m
-        # the embedding stack returns a pair; only the first element is used
-        x, _ = self.embed(tgt)
-        # checkpointing is only used in training mode
-        if self.gradient_checkpointing and self.training:
-            x = self.forward_layers_checkpointed(x, tgt_mask, memory, memory_mask)
-        else:
-            x = self.forward_layers(x, tgt_mask, memory, memory_mask)
-        if self.normalize_before:
-            x = self.after_norm(x)
-        if self.use_output_layer:
-            x = self.output_layer(x)
-        # NOTE(review): with the (B, L, L) mask built above, this sum over
-        # dim 1 yields (B, L) rather than (batch, ) - confirm against callers.
-        olens = tgt_mask.sum(1)
-        return x, torch.tensor(0.0), olens
-    def forward_layers(
-        self,
-        x: torch.Tensor,
-        tgt_mask: torch.Tensor,
-        memory: torch.Tensor,
-        memory_mask: torch.Tensor,
-    ) -> torch.Tensor:
-        """Run x through every decoder layer in order and return the result."""
-        for layer in self.decoders:
-            x, tgt_mask, memory, memory_mask = layer(x, tgt_mask, memory, memory_mask)
-        return x
-    @torch.jit.unused
-    def forward_layers_checkpointed(
-        self,
-        x: torch.Tensor,
-        tgt_mask: torch.Tensor,
-        memory: torch.Tensor,
-        memory_mask: torch.Tensor,
-    ) -> torch.Tensor:
-        """Same as forward_layers, but each layer call goes through ckpt.checkpoint."""
-        for layer in self.decoders:
-            x, tgt_mask, memory, memory_mask = ckpt.checkpoint(
-                layer.__call__, x, tgt_mask, memory, memory_mask
-            )
-        return x
-    def forward_one_step(
-        self,
-        memory: torch.Tensor,
-        memory_mask: torch.Tensor,
-        tgt: torch.Tensor,
-        tgt_mask: torch.Tensor,
-        cache: Optional[List[torch.Tensor]] = None,
-    ) -> Tuple[torch.Tensor, List[torch.Tensor]]:
-        """Forward one step.
-            This is only used for decoding.
-        Args:
-            memory: encoded memory, float32  (batch, maxlen_in, feat)
-            memory_mask: encoded memory mask, (batch, 1, maxlen_in)
-            tgt: input token ids, int64 (batch, maxlen_out)
-            tgt_mask: input token mask,  (batch, maxlen_out)
-                      dtype=torch.uint8 in PyTorch 1.2-
-                      dtype=torch.bool in PyTorch 1.2+ (include 1.2)
-            cache: cached output list of (batch, max_time_out-1, size),
-                one entry per decoder layer, or None
-        Returns:
-            y, cache: NN output value and cache per `self.decoders`.
-            `y` is computed from the last time step only (``x[:, -1]``) and is
-            a log-softmax over the output layer when use_output_layer is True.
-        """
-        x, _ = self.embed(tgt)
-        new_cache = []
-        for i, decoder in enumerate(self.decoders):
-            if cache is None:
-                c = None
-            else:
-                c = cache[i]
-            x, tgt_mask, memory, memory_mask = decoder(
-                x, tgt_mask, memory, memory_mask, cache=c
-            )
-            # the output of each layer becomes that layer's cache entry
-            new_cache.append(x)
-        if self.normalize_before:
-            y = self.after_norm(x[:, -1])
-        else:
-            y = x[:, -1]
-        if self.use_output_layer:
-            y = torch.log_softmax(self.output_layer(y), dim=-1)
-        return y, new_cache
-    def tie_or_clone_weights(self, jit_mode: bool = True):
-        """Tie or clone module weights (between word_emb and output_layer)
-        depending on whether we are using TorchScript or not.
-        Does nothing when use_output_layer is False. With jit_mode the
-        embedding weight is copied into a new Parameter; otherwise the output
-        layer shares the embedding's Parameter object."""
-        if not self.use_output_layer:
-            return
-        if jit_mode:
-            logging.info("clone emb.weight to output.weight")
-            self.output_layer.weight = torch.nn.Parameter(self.embed[0].weight.clone())
-        else:
-            logging.info("tie emb.weight with output.weight")
-            self.output_layer.weight = self.embed[0].weight
-        # right-pad the bias with zeros up to the number of rows of the weight
-        if getattr(self.output_layer, "bias", None) is not None:
-            self.output_layer.bias.data = torch.nn.functional.pad(
-                self.output_layer.bias.data,
-                (
-                    0,
-                    self.output_layer.weight.shape[0] - self.output_layer.bias.shape[0],
-                ),
-                "constant",
-                0,
-            )
-class BiTransformerDecoder(torch.nn.Module):
-    """Base class of Transfomer decoder module.
-    Args:
-        vocab_size: output dim
-        encoder_output_size: dimension of attention
-        attention_heads: the number of heads of multi head attention
-        linear_units: the hidden units number of position-wise feedforward
-        num_blocks: the number of decoder blocks
-        r_num_blocks: the number of right to left decoder blocks
-        dropout_rate: dropout rate
-        self_attention_dropout_rate: dropout rate for attention
-        input_layer: input layer type
-        use_output_layer: whether to use output layer
-        pos_enc_class: PositionalEncoding or ScaledPositionalEncoding
-        normalize_before:
-            True: use layer_norm before each sub-block of a layer.
-            False: use layer_norm after each sub-block of a layer.
-        key_bias: whether use bias in attention.linear_k, False for whisper models.
-    """
-    def __init__(
-        self,
-        vocab_size: int,
-        encoder_output_size: int,
-        attention_heads: int = 4,
-        linear_units: int = 2048,
-        num_blocks: int = 6,
-        r_num_blocks: int = 0,
-        dropout_rate: float = 0.1,
-        positional_dropout_rate: float = 0.1,
-        self_attention_dropout_rate: float = 0.0,
-        src_attention_dropout_rate: float = 0.0,
-        input_layer: str = "embed",
-        use_output_layer: bool = True,
-        normalize_before: bool = True,
-        key_bias: bool = True,
-        gradient_checkpointing: bool = False,
-        tie_word_embedding: bool = False,
-    ):
-        super().__init__()
-        self.tie_word_embedding = tie_word_embedding
-        self.left_decoder = TransformerDecoder(
-            vocab_size,
-            encoder_output_size,
-            attention_heads,
-            linear_units,
-            num_blocks,
-            dropout_rate,
-            positional_dropout_rate,
-            self_attention_dropout_rate,
-            src_attention_dropout_rate,
-            input_layer,
-            use_output_layer,
-            normalize_before,
-            key_bias=key_bias,
-            gradient_checkpointing=gradient_checkpointing,
-            tie_word_embedding=tie_word_embedding,
-        )
-        self.right_decoder = TransformerDecoder(
-            vocab_size,
-            encoder_output_size,
-            attention_heads,
-            linear_units,
-            r_num_blocks,
-            dropout_rate,
-            positional_dropout_rate,
-            self_attention_dropout_rate,
-            src_attention_dropout_rate,
-            input_layer,
-            use_output_layer,
-            normalize_before,
-            key_bias=key_bias,
-            gradient_checkpointing=gradient_checkpointing,
-            tie_word_embedding=tie_word_embedding,
-        )
-    def forward(
-        self,
-        memory: torch.Tensor,
-        memory_mask: torch.Tensor,
-        ys_in_pad: torch.Tensor,
-        ys_in_lens: torch.Tensor,
-        r_ys_in_pad: torch.Tensor,
-        reverse_weight: float = 0.0,
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        """Forward decoder.
-        Args:
-            memory: encoded memory, float32  (batch, maxlen_in, feat)
-            memory_mask: encoder memory mask, (batch, 1, maxlen_in)
-            ys_in_pad: padded input token ids, int64 (batch, maxlen_out)
-            ys_in_lens: input lengths of this batch (batch)
-            r_ys_in_pad: padded input token ids, int64 (batch, maxlen_out),
-                used for right to left decoder
-            reverse_weight: used for right to left decoder
-        Returns:
-            (tuple): tuple containing:
-                x: decoded token score before softmax (batch, maxlen_out,
-                    vocab_size) if use_output_layer is True,
-                r_x: decoded token score (right to left decoder)
-                    before softmax (batch, maxlen_out, vocab_size)
-                    if use_output_layer is True,
-                olens: (batch, )
-        """
-        l_x, _, olens = self.left_decoder(memory, memory_mask, ys_in_pad, ys_in_lens)
-        r_x = torch.tensor(0.0)
-        if reverse_weight > 0.0:
-            r_x, _, olens = self.right_decoder(
-                memory, memory_mask, r_ys_in_pad, ys_in_lens
-            )
-        return l_x, r_x, olens
-    def forward_one_step(
-        self,
-        memory: torch.Tensor,
-        memory_mask: torch.Tensor,
-        tgt: torch.Tensor,
-        tgt_mask: torch.Tensor,
-        cache: Optional[List[torch.Tensor]] = None,
-    ) -> Tuple[torch.Tensor, List[torch.Tensor]]:
-        """Forward one step.
-            This is only used for decoding.
-        Args:
-            memory: encoded memory, float32  (batch, maxlen_in, feat)
-            memory_mask: encoded memory mask, (batch, 1, maxlen_in)
-            tgt: input token ids, int64 (batch, maxlen_out)
-            tgt_mask: input token mask,  (batch, maxlen_out)
-                      dtype=torch.uint8 in PyTorch 1.2-
-                      dtype=torch.bool in PyTorch 1.2+ (include 1.2)
-            cache: cached output list of (batch, max_time_out-1, size)
-        Returns:
-            y, cache: NN output value and cache per `self.decoders`.
-            `y.shape` is (batch, maxlen_out, token)
-        """
-        return self.left_decoder.forward_one_step(
-            memory, memory_mask, tgt, tgt_mask, cache
-        )
-    def tie_or_clone_weights(self, jit_mode: bool = True):
-        """Tie or clone module weights (between word_emb and output_layer)
-        depending of whether we are using TorchScript or not"""
-        self.left_decoder.tie_or_clone_weights(jit_mode)
-        self.right_decoder.tie_or_clone_weights(jit_mode)

cosyvoice/transformer/decoder_layer.py DELETED Viewed

@@ -1,132 +0,0 @@
-# Copyright (c) 2019 Shigeki Karita
-#               2020 Mobvoi Inc (Binbin Zhang)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Decoder self-attention layer definition."""
-from typing import Optional, Tuple
-import torch
-from torch import nn
-class DecoderLayer(nn.Module):
-    """Single decoder layer module.
-    Args:
-        size (int): Input dimension.
-        self_attn (torch.nn.Module): Self-attention module instance.
-            `MultiHeadedAttention` instance can be used as the argument.
-        src_attn (torch.nn.Module): Inter-attention module instance.
-            `MultiHeadedAttention` instance can be used as the argument.
-            If `None` is passed, Inter-attention is not used, such as
-            CIF, GPT, and other decoder only model.
-        feed_forward (torch.nn.Module): Feed-forward module instance.
-            `PositionwiseFeedForward` instance can be used as the argument.
-        dropout_rate (float): Dropout rate.
-        normalize_before (bool):
-            True: use layer_norm before each sub-block.
-            False: to use layer_norm after each sub-block.
-    """
-    def __init__(
-        self,
-        size: int,
-        self_attn: nn.Module,
-        src_attn: Optional[nn.Module],
-        feed_forward: nn.Module,
-        dropout_rate: float,
-        normalize_before: bool = True,
-    ):
-        """Construct an DecoderLayer object."""
-        super().__init__()
-        self.size = size
-        self.self_attn = self_attn
-        self.src_attn = src_attn
-        self.feed_forward = feed_forward
-        self.norm1 = nn.LayerNorm(size, eps=1e-5)
-        self.norm2 = nn.LayerNorm(size, eps=1e-5)
-        self.norm3 = nn.LayerNorm(size, eps=1e-5)
-        self.dropout = nn.Dropout(dropout_rate)
-        self.normalize_before = normalize_before
-    def forward(
-        self,
-        tgt: torch.Tensor,
-        tgt_mask: torch.Tensor,
-        memory: torch.Tensor,
-        memory_mask: torch.Tensor,
-        cache: Optional[torch.Tensor] = None,
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
-        """Compute decoded features.
-        Args:
-            tgt (torch.Tensor): Input tensor (#batch, maxlen_out, size).
-            tgt_mask (torch.Tensor): Mask for input tensor
-                (#batch, maxlen_out).
-            memory (torch.Tensor): Encoded memory
-                (#batch, maxlen_in, size).
-            memory_mask (torch.Tensor): Encoded memory mask
-                (#batch, maxlen_in).
-            cache (torch.Tensor): cached tensors.
-                (#batch, maxlen_out - 1, size).
-        Returns:
-            torch.Tensor: Output tensor (#batch, maxlen_out, size).
-            torch.Tensor: Mask for output tensor (#batch, maxlen_out).
-            torch.Tensor: Encoded memory (#batch, maxlen_in, size).
-            torch.Tensor: Encoded memory mask (#batch, maxlen_in).
-        """
-        residual = tgt
-        if self.normalize_before:
-            tgt = self.norm1(tgt)
-        if cache is None:
-            tgt_q = tgt
-            tgt_q_mask = tgt_mask
-        else:
-            # compute only the last frame query keeping dim: max_time_out -> 1
-            assert cache.shape == (
-                tgt.shape[0],
-                tgt.shape[1] - 1,
-                self.size,
-            ), "{cache.shape} == {(tgt.shape[0], tgt.shape[1] - 1, self.size)}"
-            tgt_q = tgt[:, -1:, :]
-            residual = residual[:, -1:, :]
-            tgt_q_mask = tgt_mask[:, -1:, :]
-        x = residual + self.dropout(self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)[0])
-        if not self.normalize_before:
-            x = self.norm1(x)
-        if self.src_attn is not None:
-            residual = x
-            if self.normalize_before:
-                x = self.norm2(x)
-            x = residual + self.dropout(
-                self.src_attn(x, memory, memory, memory_mask)[0]
-            )
-            if not self.normalize_before:
-                x = self.norm2(x)
-        residual = x
-        if self.normalize_before:
-            x = self.norm3(x)
-        x = residual + self.dropout(self.feed_forward(x))
-        if not self.normalize_before:
-            x = self.norm3(x)
-        if cache is not None:
-            x = torch.cat([cache, x], dim=1)
-        return x, tgt_mask, memory, memory_mask

cosyvoice/transformer/embedding.py DELETED Viewed

@@ -1,293 +0,0 @@
-# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu)
-#               2024 Alibaba Inc (Xiang Lyu)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# Modified from ESPnet(https://github.com/espnet/espnet)
-"""Positonal Encoding Module."""
-import math
-from typing import Tuple, Union
-import torch
-import torch.nn.functional as F
-import numpy as np
-class PositionalEncoding(torch.nn.Module):
-    """Positional encoding.
-    :param int d_model: embedding dim
-    :param float dropout_rate: dropout rate
-    :param int max_len: maximum input length
-    PE(pos, 2i)   = sin(pos/(10000^(2i/dmodel)))
-    PE(pos, 2i+1) = cos(pos/(10000^(2i/dmodel)))
-    """
-    def __init__(
-        self,
-        d_model: int,
-        dropout_rate: float,
-        max_len: int = 5000,
-        reverse: bool = False,
-    ):
-        """Construct an PositionalEncoding object."""
-        super().__init__()
-        self.d_model = d_model
-        self.xscale = math.sqrt(self.d_model)
-        self.dropout = torch.nn.Dropout(p=dropout_rate)
-        self.max_len = max_len
-        self.pe = torch.zeros(self.max_len, self.d_model)
-        position = torch.arange(0, self.max_len, dtype=torch.float32).unsqueeze(1)
-        div_term = torch.exp(
-            torch.arange(0, self.d_model, 2, dtype=torch.float32)
-            * -(math.log(10000.0) / self.d_model)
-        )
-        self.pe[:, 0::2] = torch.sin(position * div_term)
-        self.pe[:, 1::2] = torch.cos(position * div_term)
-        self.pe = self.pe.unsqueeze(0)
-    def forward(
-        self, x: torch.Tensor, offset: Union[int, torch.Tensor] = 0
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        """Add positional encoding.
-        Args:
-            x (torch.Tensor): Input. Its shape is (batch, time, ...)
-            offset (int, torch.tensor): position offset
-        Returns:
-            torch.Tensor: Encoded tensor. Its shape is (batch, time, ...)
-            torch.Tensor: for compatibility to RelPositionalEncoding
-        """
-        self.pe = self.pe.to(x.device)
-        pos_emb = self.position_encoding(offset, x.size(1), False)
-        x = x * self.xscale + pos_emb
-        return self.dropout(x), self.dropout(pos_emb)
-    def position_encoding(
-        self, offset: Union[int, torch.Tensor], size: int, apply_dropout: bool = True
-    ) -> torch.Tensor:
-        """For getting encoding in a streaming fashion
-        Attention!!!!!
-        we apply dropout only once at the whole utterance level in a
-        non-streaming way, but will call this function several times with
-        increasing input size in a streaming scenario, so the dropout will
-        be applied several times.
-        Args:
-            offset (int or torch.tensor): start offset
-            size (int): required size of position encoding
-        Returns:
-            torch.Tensor: Corresponding encoding
-        """
-        # How to subscript a Union type:
-        #   https://github.com/pytorch/pytorch/issues/69434
-        if isinstance(offset, int):
-            assert offset + size <= self.max_len
-            pos_emb = self.pe[:, offset : offset + size]
-        elif isinstance(offset, torch.Tensor) and offset.dim() == 0:  # scalar
-            assert offset + size <= self.max_len
-            pos_emb = self.pe[:, offset : offset + size]
-        else:  # for batched streaming decoding on GPU
-            assert torch.max(offset) + size <= self.max_len
-            index = offset.unsqueeze(1) + torch.arange(0, size).to(
-                offset.device
-            )  # B X T
-            flag = index > 0
-            # remove negative offset
-            index = index * flag
-            pos_emb = F.embedding(index, self.pe[0])  # B X T X d_model
-        if apply_dropout:
-            pos_emb = self.dropout(pos_emb)
-        return pos_emb
-class RelPositionalEncoding(PositionalEncoding):
-    """Relative positional encoding module.
-    See : Appendix B in https://arxiv.org/abs/1901.02860
-    Args:
-        d_model (int): Embedding dimension.
-        dropout_rate (float): Dropout rate.
-        max_len (int): Maximum input length.
-    """
-    def __init__(self, d_model: int, dropout_rate: float, max_len: int = 5000):
-        """Initialize class."""
-        super().__init__(d_model, dropout_rate, max_len, reverse=True)
-    def forward(
-        self, x: torch.Tensor, offset: Union[int, torch.Tensor] = 0
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        """Compute positional encoding.
-        Args:
-            x (torch.Tensor): Input tensor (batch, time, `*`).
-        Returns:
-            torch.Tensor: Encoded tensor (batch, time, `*`).
-            torch.Tensor: Positional embedding tensor (1, time, `*`).
-        """
-        self.pe = self.pe.to(x.device)
-        x = x * self.xscale
-        pos_emb = self.position_encoding(offset, x.size(1), False)
-        return self.dropout(x), self.dropout(pos_emb)
-class WhisperPositionalEncoding(PositionalEncoding):
-    """Sinusoids position encoding used in openai-whisper.encoder"""
-    def __init__(self, d_model: int, dropout_rate: float, max_len: int = 1500):
-        super().__init__(d_model, dropout_rate, max_len)
-        self.xscale = 1.0
-        log_timescale_increment = np.log(10000) / (d_model // 2 - 1)
-        inv_timescales = torch.exp(
-            -log_timescale_increment * torch.arange(d_model // 2)
-        )
-        scaled_time = (
-            torch.arange(max_len)[:, np.newaxis] * inv_timescales[np.newaxis, :]
-        )
-        pe = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1)
-        delattr(self, "pe")
-        self.register_buffer("pe", pe.unsqueeze(0))
-class LearnablePositionalEncoding(PositionalEncoding):
-    """Learnable position encoding used in openai-whisper.decoder"""
-    def __init__(self, d_model: int, dropout_rate: float, max_len: int = 448):
-        super().__init__(d_model, dropout_rate, max_len)
-        # NOTE(xcsong): overwrite self.pe & self.xscale
-        self.pe = torch.nn.Parameter(torch.empty(1, max_len, d_model))
-        self.xscale = 1.0
-class NoPositionalEncoding(torch.nn.Module):
-    """No position encoding"""
-    def __init__(self, d_model: int, dropout_rate: float):
-        super().__init__()
-        self.d_model = d_model
-        self.dropout = torch.nn.Dropout(p=dropout_rate)
-    def forward(
-        self, x: torch.Tensor, offset: Union[int, torch.Tensor] = 0
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        """Just return zero vector for interface compatibility"""
-        pos_emb = torch.zeros(1, x.size(1), self.d_model).to(x.device)
-        return self.dropout(x), pos_emb
-    def position_encoding(
-        self, offset: Union[int, torch.Tensor], size: int
-    ) -> torch.Tensor:
-        return torch.zeros(1, size, self.d_model)
-class EspnetRelPositionalEncoding(torch.nn.Module):
-    """Relative positional encoding module (new implementation).
-    Details can be found in https://github.com/espnet/espnet/pull/2816.
-    See : Appendix B in https://arxiv.org/abs/1901.02860
-    Args:
-        d_model (int): Embedding dimension.
-        dropout_rate (float): Dropout rate.
-        max_len (int): Maximum input length.
-    """
-    def __init__(self, d_model: int, dropout_rate: float, max_len: int = 5000):
-        """Construct an PositionalEncoding object."""
-        super(EspnetRelPositionalEncoding, self).__init__()
-        self.d_model = d_model
-        self.xscale = math.sqrt(self.d_model)
-        self.dropout = torch.nn.Dropout(p=dropout_rate)
-        self.pe = None
-        self.extend_pe(torch.tensor(0.0).expand(1, max_len))
-    def extend_pe(self, x: torch.Tensor):
-        """Reset the positional encodings."""
-        if self.pe is not None:
-            # self.pe contains both positive and negative parts
-            # the length of self.pe is 2 * input_len - 1
-            if self.pe.size(1) >= x.size(1) * 2 - 1:
-                if self.pe.dtype != x.dtype or self.pe.device != x.device:
-                    self.pe = self.pe.to(dtype=x.dtype, device=x.device)
-                return
-        # Suppose `i` is the position of the query vector and `j` is the
-        # position of the key vector. We use positive relative positions when keys
-        # are to the left (i>j) and negative relative positions otherwise (i<j).
-        pe_positive = torch.zeros(x.size(1), self.d_model)
-        pe_negative = torch.zeros(x.size(1), self.d_model)
-        position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1)
-        div_term = torch.exp(
-            torch.arange(0, self.d_model, 2, dtype=torch.float32)
-            * -(math.log(10000.0) / self.d_model)
-        )
-        pe_positive[:, 0::2] = torch.sin(position * div_term)
-        pe_positive[:, 1::2] = torch.cos(position * div_term)
-        pe_negative[:, 0::2] = torch.sin(-1 * position * div_term)
-        pe_negative[:, 1::2] = torch.cos(-1 * position * div_term)
-        # Reverse the order of positive indices and concat both positive and
-        # negative indices. This is used to support the shifting trick
-        # as in https://arxiv.org/abs/1901.02860
-        pe_positive = torch.flip(pe_positive, [0]).unsqueeze(0)
-        pe_negative = pe_negative[1:].unsqueeze(0)
-        pe = torch.cat([pe_positive, pe_negative], dim=1)
-        self.pe = pe.to(device=x.device, dtype=x.dtype)
-    def forward(
-        self, x: torch.Tensor, offset: Union[int, torch.Tensor] = 0
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        """Add positional encoding.
-        Args:
-            x (torch.Tensor): Input tensor (batch, time, `*`).
-        Returns:
-            torch.Tensor: Encoded tensor (batch, time, `*`).
-        """
-        self.extend_pe(x)
-        x = x * self.xscale
-        pos_emb = self.position_encoding(size=x.size(1), offset=offset)
-        return self.dropout(x), self.dropout(pos_emb)
-    def position_encoding(
-        self, offset: Union[int, torch.Tensor], size: int
-    ) -> torch.Tensor:
-        """For getting encoding in a streaming fashion
-        Attention!!!!!
-        we apply dropout only once at the whole utterance level in a
-        non-streaming way, but will call this function several times with
-        increasing input size in a streaming scenario, so the dropout will
-        be applied several times.
-        Args:
-            offset (int or torch.tensor): start offset
-            size (int): required size of position encoding
-        Returns:
-            torch.Tensor: Corresponding encoding
-        """
-        pos_emb = self.pe[
-            :,
-            self.pe.size(1) // 2 - size + 1 : self.pe.size(1) // 2 + size,
-        ]
-        return pos_emb

cosyvoice/transformer/encoder.py DELETED Viewed

@@ -1,633 +0,0 @@
-# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu)
-#               2022 Xingchen Song ([email protected])
-#               2024 Alibaba Inc (Xiang Lyu)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# Modified from ESPnet(https://github.com/espnet/espnet)
-"""Encoder definition."""
-from typing import Tuple
-import time
-import torch
-import torch.utils.checkpoint as ckpt
-import torch.nn.functional as F
-from cosyvoice.transformer.convolution import ConvolutionModule
-from cosyvoice.transformer.encoder_layer import (
-    TransformerEncoderLayer,
-)
-from cosyvoice.transformer.encoder_layer import (
-    ConformerEncoderLayer,
-)
-from cosyvoice.transformer.positionwise_feed_forward import (
-    PositionwiseFeedForward,
-)
-from cosyvoice.utils.class_utils import (
-    COSYVOICE_EMB_CLASSES,
-    COSYVOICE_SUBSAMPLE_CLASSES,
-    COSYVOICE_ATTENTION_CLASSES,
-    COSYVOICE_ACTIVATION_CLASSES,
-)
-from cosyvoice.utils.mask import make_pad_mask
-from cosyvoice.utils.mask import add_optional_chunk_mask
-class BaseEncoder(torch.nn.Module):
-    def __init__(
-        self,
-        input_size: int,
-        output_size: int = 256,
-        attention_heads: int = 4,
-        linear_units: int = 2048,
-        num_blocks: int = 6,
-        dropout_rate: float = 0.1,
-        positional_dropout_rate: float = 0.1,
-        attention_dropout_rate: float = 0.0,
-        input_layer: str = "conv2d",
-        pos_enc_layer_type: str = "abs_pos",
-        normalize_before: bool = True,
-        static_chunk_size: int = 0,
-        use_dynamic_chunk: bool = False,
-        global_cmvn: torch.nn.Module = None,
-        use_dynamic_left_chunk: bool = False,
-        gradient_checkpointing: bool = False,
-    ):
-        """
-        Args:
-            input_size (int): input dim
-            output_size (int): dimension of attention
-            attention_heads (int): the number of heads of multi head attention
-            linear_units (int): the hidden units number of position-wise feed
-                forward
-            num_blocks (int): the number of encoder blocks
-            dropout_rate (float): dropout rate
-            attention_dropout_rate (float): dropout rate in attention
-            positional_dropout_rate (float): dropout rate after adding
-                positional encoding
-            input_layer (str): input layer type.
-                optional [linear, conv2d, conv2d6, conv2d8]
-            pos_enc_layer_type (str): Encoder positional encoding layer type.
-                optional [abs_pos, scaled_abs_pos, rel_pos, no_pos]
-            normalize_before (bool):
-                True: use layer_norm before each sub-block of a layer.
-                False: use layer_norm after each sub-block of a layer.
-            static_chunk_size (int): chunk size for static chunk training and
-                decoding
-            use_dynamic_chunk (bool): whether use dynamic chunk size for
-                training or not, You can only use fixed chunk(chunk_size > 0)
-                or dynamic chunk size(use_dynamic_chunk = True)
-            global_cmvn (Optional[torch.nn.Module]): Optional GlobalCMVN module
-            use_dynamic_left_chunk (bool): whether use dynamic left chunk in
-                dynamic chunk training
-            key_bias: whether use bias in attention.linear_k, False for whisper models.
-            gradient_checkpointing: rerunning a forward-pass segment for each
-                checkpointed segment during backward.
-        """
-        super().__init__()
-        self._output_size = output_size
-        self.global_cmvn = global_cmvn
-        self.embed = COSYVOICE_SUBSAMPLE_CLASSES[input_layer](
-            input_size,
-            output_size,
-            dropout_rate,
-            COSYVOICE_EMB_CLASSES[pos_enc_layer_type](
-                output_size, positional_dropout_rate
-            ),
-        )
-        self.normalize_before = normalize_before
-        self.after_norm = torch.nn.LayerNorm(output_size, eps=1e-5)
-        self.static_chunk_size = static_chunk_size
-        self.use_dynamic_chunk = use_dynamic_chunk
-        self.use_dynamic_left_chunk = use_dynamic_left_chunk
-        self.gradient_checkpointing = gradient_checkpointing
-    def output_size(self) -> int:
-        return self._output_size
-    def forward(
-        self,
-        xs: torch.Tensor,
-        xs_lens: torch.Tensor,
-        decoding_chunk_size: int = 0,
-        num_decoding_left_chunks: int = -1,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        """Embed positions in tensor.
-        Args:
-            xs: padded input tensor (B, T, D)
-            xs_lens: input length (B)
-            decoding_chunk_size: decoding chunk size for dynamic chunk
-                0: default for training, use random dynamic chunk.
-                <0: for decoding, use full chunk.
-                >0: for decoding, use fixed chunk size as set.
-            num_decoding_left_chunks: number of left chunks, this is for decoding,
-            the chunk size is decoding_chunk_size.
-                >=0: use num_decoding_left_chunks
-                <0: use all left chunks
-        Returns:
-            encoder output tensor xs, and subsampled masks
-            xs: padded output tensor (B, T' ~= T/subsample_rate, D)
-            masks: torch.Tensor batch padding mask after subsample
-                (B, 1, T' ~= T/subsample_rate)
-        NOTE(xcsong):
-            We pass the `__call__` method of the modules instead of `forward` to the
-            checkpointing API because `__call__` attaches all the hooks of the module.
-            https://discuss.pytorch.org/t/any-different-between-model-input-and-model-forward-input/3690/2
-        """
-        T = xs.size(1)
-        masks = ~make_pad_mask(xs_lens, T).unsqueeze(1)  # (B, 1, T)
-        if self.global_cmvn is not None:
-            xs = self.global_cmvn(xs)
-        xs, pos_emb, masks = self.embed(xs, masks)
-        mask_pad = masks  # (B, 1, T/subsample_rate)
-        chunk_masks = add_optional_chunk_mask(
-            xs,
-            masks,
-            self.use_dynamic_chunk,
-            self.use_dynamic_left_chunk,
-            decoding_chunk_size,
-            self.static_chunk_size,
-            num_decoding_left_chunks,
-        )
-        print(f"chunk_masks shape: {chunk_masks.shape}")
-        if self.gradient_checkpointing and self.training:
-            xs = self.forward_layers_checkpointed(xs, chunk_masks, pos_emb, mask_pad)
-        else:
-            xs = self.forward_layers(xs, chunk_masks, pos_emb, mask_pad)
-        if self.normalize_before:
-            xs = self.after_norm(xs)
-        # Here we assume the mask is not changed in encoder layers, so just
-        # return the masks before encoder layers, and the masks will be used
-        # for cross attention with decoder later
-        return xs, masks
-    def forward_layers(
-        self,
-        xs: torch.Tensor,
-        chunk_masks: torch.Tensor,
-        pos_emb: torch.Tensor,
-        mask_pad: torch.Tensor,
-    ) -> torch.Tensor:
-        for layer in self.encoders:
-            xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad)
-        return xs
-    @torch.jit.unused
-    def forward_layers_checkpointed(
-        self,
-        xs: torch.Tensor,
-        chunk_masks: torch.Tensor,
-        pos_emb: torch.Tensor,
-        mask_pad: torch.Tensor,
-    ) -> torch.Tensor:
-        for layer in self.encoders:
-            xs, chunk_masks, _, _ = ckpt.checkpoint(
-                layer.__call__, xs, chunk_masks, pos_emb, mask_pad
-            )
-        return xs
-    @torch.jit.export
-    def forward_chunk(
-        self,
-        xs: torch.Tensor,
-        offset: int,
-        required_cache_size: int,
-        att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0),
-        cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0),
-        att_mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool),
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        """ Forward just one chunk
-        Args:
-            xs (torch.Tensor): chunk input, with shape (b=1, time, mel-dim),
-                where `time == (chunk_size - 1) * subsample_rate + \
-                        subsample.right_context + 1`
-            offset (int): current offset in encoder output time stamp
-            required_cache_size (int): cache size required for next chunk
-                computation
-                >=0: actual cache size
-                <0: means all history cache is required
-            att_cache (torch.Tensor): cache tensor for KEY & VALUE in
-                transformer/conformer attention, with shape
-                (elayers, head, cache_t1, d_k * 2), where
-                `head * d_k == hidden-dim` and
-                `cache_t1 == chunk_size * num_decoding_left_chunks`.
-            cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer,
-                (elayers, b=1, hidden-dim, cache_t2), where
-                `cache_t2 == cnn.lorder - 1`
-        Returns:
-            torch.Tensor: output of current input xs,
-                with shape (b=1, chunk_size, hidden-dim).
-            torch.Tensor: new attention cache required for next chunk, with
-                dynamic shape (elayers, head, ?, d_k * 2)
-                depending on required_cache_size.
-            torch.Tensor: new conformer cnn cache required for next chunk, with
-                same shape as the original cnn_cache.
-        """
-        assert xs.size(0) == 1
-        # tmp_masks is just for interface compatibility
-        tmp_masks = torch.ones(1, xs.size(1), device=xs.device, dtype=torch.bool)
-        tmp_masks = tmp_masks.unsqueeze(1)
-        if self.global_cmvn is not None:
-            xs = self.global_cmvn(xs)
-        # NOTE(xcsong): Before embed, shape(xs) is (b=1, time, mel-dim)
-        xs, pos_emb, _ = self.embed(xs, tmp_masks, offset)
-        # NOTE(xcsong): After  embed, shape(xs) is (b=1, chunk_size, hidden-dim)
-        elayers, cache_t1 = att_cache.size(0), att_cache.size(2)
-        chunk_size = xs.size(1)
-        attention_key_size = cache_t1 + chunk_size
-        pos_emb = self.embed.position_encoding(
-            offset=offset - cache_t1, size=attention_key_size
-        )
-        if required_cache_size < 0:
-            next_cache_start = 0
-        elif required_cache_size == 0:
-            next_cache_start = attention_key_size
-        else:
-            next_cache_start = max(attention_key_size - required_cache_size, 0)
-        r_att_cache = []
-        r_cnn_cache = []
-        for i, layer in enumerate(self.encoders):
-            # NOTE(xcsong): Before layer.forward
-            #   shape(att_cache[i:i + 1]) is (1, head, cache_t1, d_k * 2),
-            #   shape(cnn_cache[i])       is (b=1, hidden-dim, cache_t2)
-            xs, _, new_att_cache, new_cnn_cache = layer(
-                xs,
-                att_mask,
-                pos_emb,
-                att_cache=att_cache[i : i + 1] if elayers > 0 else att_cache,
-                cnn_cache=cnn_cache[i] if cnn_cache.size(0) > 0 else cnn_cache,
-            )
-            # NOTE(xcsong): After layer.forward
-            #   shape(new_att_cache) is (1, head, attention_key_size, d_k * 2),
-            #   shape(new_cnn_cache) is (b=1, hidden-dim, cache_t2)
-            r_att_cache.append(new_att_cache[:, :, next_cache_start:, :])
-            r_cnn_cache.append(new_cnn_cache.unsqueeze(0))
-        if self.normalize_before:
-            xs = self.after_norm(xs)
-        # NOTE(xcsong): shape(r_att_cache) is (elayers, head, ?, d_k * 2),
-        #   ? may be larger than cache_t1, it depends on required_cache_size
-        r_att_cache = torch.cat(r_att_cache, dim=0)
-        # NOTE(xcsong): shape(r_cnn_cache) is (e, b=1, hidden-dim, cache_t2)
-        r_cnn_cache = torch.cat(r_cnn_cache, dim=0)
-        return (xs, r_att_cache, r_cnn_cache)
-    @torch.jit.unused
-    def forward_chunk_by_chunk(
-        self,
-        xs: torch.Tensor,
-        decoding_chunk_size: int,
-        num_decoding_left_chunks: int = -1,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        """Encode `xs` chunk by chunk, the way a streaming decoder would.
-        The input is cut into overlapping windows and each window is fed to
-        `forward_chunk`, which hands back updated attention and cnn caches
-        for the next window. Streaming computation has three kinds of state:
-            1. transformer/conformer encoder layers output cache
-            2. convolution in conformer
-            3. convolution in subsampling
-        No cache is kept for the subsampling convolution, because:
-            1. Overlapping the input windows already makes the subsampling
-               module produce the right result; the recomputation is wasted
-               work, but subsampling is only a very small fraction of the
-               whole model's computation.
-            2. The subsampling module typically stacks several convolution
-               layers with different subsampling rates, which makes caching
-               tricky and complicated.
-            3. The convolution layers are stacked with nn.Sequential, which
-               would have to be rewritten to support a cache.
-        Args:
-            xs (torch.Tensor): (1, max_len, dim)
-            decoding_chunk_size (int): decoding chunk size, must be > 0.
-            num_decoding_left_chunks (int): multiplied by
-                `decoding_chunk_size` to form the `required_cache_size`
-                passed to `forward_chunk`.
-        Returns:
-            torch.Tensor: outputs of all chunks concatenated along dim 1.
-            torch.Tensor: all-True bool mask of shape (1, 1, ys.size(1)).
-        """
-        assert decoding_chunk_size > 0
-        # The model must have been trained with static or dynamic chunks
-        assert self.static_chunk_size > 0 or self.use_dynamic_chunk
-        rate = self.embed.subsampling_rate
-        # input frames needed per output frame: right context + current frame
-        frames_per_step = self.embed.right_context + 1
-        hop = rate * decoding_chunk_size
-        window = (decoding_chunk_size - 1) * rate + frames_per_step
-        total_frames = xs.size(1)
-        required_cache_size = decoding_chunk_size * num_decoding_left_chunks
-        # Empty caches for the first chunk; forward_chunk returns the next ones
-        att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device)
-        cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device)
-        chunk_outputs = []
-        offset = 0
-        # Slide the overlapping window over the input, one hop at a time
-        for start in range(0, total_frames - frames_per_step + 1, hop):
-            stop = min(start + window, total_frames)
-            y, att_cache, cnn_cache = self.forward_chunk(
-                xs[:, start:stop, :],
-                offset,
-                required_cache_size,
-                att_cache,
-                cnn_cache,
-            )
-            chunk_outputs.append(y)
-            offset += y.size(1)
-        ys = torch.cat(chunk_outputs, 1)
-        masks = torch.ones((1, 1, ys.size(1)), device=ys.device, dtype=torch.bool)
-        return ys, masks
-class TransformerEncoder(BaseEncoder):
-    """Encoder made of a stack of `TransformerEncoderLayer` blocks."""
-    def __init__(
-        self,
-        input_size: int,
-        output_size: int = 256,
-        attention_heads: int = 4,
-        linear_units: int = 2048,
-        num_blocks: int = 6,
-        dropout_rate: float = 0.1,
-        positional_dropout_rate: float = 0.1,
-        attention_dropout_rate: float = 0.0,
-        input_layer: str = "conv2d",
-        pos_enc_layer_type: str = "abs_pos",
-        normalize_before: bool = True,
-        static_chunk_size: int = 0,
-        use_dynamic_chunk: bool = False,
-        global_cmvn: torch.nn.Module = None,
-        use_dynamic_left_chunk: bool = False,
-        key_bias: bool = True,
-        selfattention_layer_type: str = "selfattn",
-        activation_type: str = "relu",
-        gradient_checkpointing: bool = False,
-    ):
-        """Construct TransformerEncoder
-        The first fifteen arguments and `gradient_checkpointing` are handed
-        to the base-class constructor; see it for their meaning.
-        Args:
-            key_bias (bool): forwarded to the attention class constructor.
-            selfattention_layer_type (str): key into
-                COSYVOICE_ATTENTION_CLASSES selecting the attention class.
-            activation_type (str): key into COSYVOICE_ACTIVATION_CLASSES
-                selecting the feed-forward activation.
-        """
-        super().__init__(
-            input_size,
-            output_size,
-            attention_heads,
-            linear_units,
-            num_blocks,
-            dropout_rate,
-            positional_dropout_rate,
-            attention_dropout_rate,
-            input_layer,
-            pos_enc_layer_type,
-            normalize_before,
-            static_chunk_size,
-            use_dynamic_chunk,
-            global_cmvn,
-            use_dynamic_left_chunk,
-            gradient_checkpointing,
-        )
-        # One activation instance, shared by every feed-forward block
-        activation = COSYVOICE_ACTIVATION_CLASSES[activation_type]()
-        blocks = []
-        for _ in range(num_blocks):
-            self_attn = COSYVOICE_ATTENTION_CLASSES[selfattention_layer_type](
-                attention_heads, output_size, attention_dropout_rate, key_bias
-            )
-            feed_forward = PositionwiseFeedForward(
-                output_size, linear_units, dropout_rate, activation
-            )
-            blocks.append(
-                TransformerEncoderLayer(
-                    output_size,
-                    self_attn,
-                    feed_forward,
-                    dropout_rate,
-                    normalize_before,
-                )
-            )
-        self.encoders = torch.nn.ModuleList(blocks)
-class ConformerEncoder(BaseEncoder):
-    """Conformer encoder module.
-    Besides the stack of `ConformerEncoderLayer` blocks, the class keeps one
-    CUDA graph per captured sequence length (`inference_graphs`) together
-    with the tensors each graph was captured on (`inference_buffers`), so
-    that `inference` can replay a graph instead of re-running the layers.
-    """
-    def __init__(
-        self,
-        input_size: int,
-        output_size: int = 256,
-        attention_heads: int = 4,
-        linear_units: int = 2048,
-        num_blocks: int = 6,
-        dropout_rate: float = 0.1,
-        positional_dropout_rate: float = 0.1,
-        attention_dropout_rate: float = 0.0,
-        input_layer: str = "conv2d",
-        pos_enc_layer_type: str = "rel_pos",
-        normalize_before: bool = True,
-        static_chunk_size: int = 0,
-        use_dynamic_chunk: bool = False,
-        global_cmvn: torch.nn.Module = None,
-        use_dynamic_left_chunk: bool = False,
-        positionwise_conv_kernel_size: int = 1,
-        macaron_style: bool = True,
-        selfattention_layer_type: str = "rel_selfattn",
-        activation_type: str = "swish",
-        use_cnn_module: bool = True,
-        cnn_module_kernel: int = 15,
-        causal: bool = False,
-        cnn_module_norm: str = "batch_norm",
-        key_bias: bool = True,
-        gradient_checkpointing: bool = False,
-    ):
-        """Construct ConformerEncoder
-        Args:
-            input_size to use_dynamic_left_chunk, and gradient_checkpointing:
-                passed to the base-class constructor, see in BaseEncoder
-            positionwise_conv_kernel_size (int): Kernel size of positionwise
-                conv1d layer. Not referenced by this constructor.
-            macaron_style (bool): Whether to use macaron style for
-                positionwise layer (adds a second feed-forward per layer).
-            selfattention_layer_type (str): Encoder attention layer type,
-                used as the key into COSYVOICE_ATTENTION_CLASSES.
-            activation_type (str): Encoder activation function type, used as
-                the key into COSYVOICE_ACTIVATION_CLASSES.
-            use_cnn_module (bool): Whether to use convolution module.
-            cnn_module_kernel (int): Kernel size of convolution module.
-            causal (bool): whether to use causal convolution or not.
-            cnn_module_norm (str): forwarded to ConvolutionModule.
-            key_bias: whether use bias in attention.linear_k, False for whisper models.
-        """
-        super().__init__(
-            input_size,
-            output_size,
-            attention_heads,
-            linear_units,
-            num_blocks,
-            dropout_rate,
-            positional_dropout_rate,
-            attention_dropout_rate,
-            input_layer,
-            pos_enc_layer_type,
-            normalize_before,
-            static_chunk_size,
-            use_dynamic_chunk,
-            global_cmvn,
-            use_dynamic_left_chunk,
-            gradient_checkpointing,
-        )
-        # One activation instance, shared by all feed-forward and conv modules
-        activation = COSYVOICE_ACTIVATION_CLASSES[activation_type]()
-        # self-attention module definition
-        encoder_selfattn_layer_args = (
-            attention_heads,
-            output_size,
-            attention_dropout_rate,
-            key_bias,
-        )
-        # feed-forward module definition
-        positionwise_layer_args = (
-            output_size,
-            linear_units,
-            dropout_rate,
-            activation,
-        )
-        # convolution module definition
-        convolution_layer_args = (
-            output_size,
-            cnn_module_kernel,
-            activation,
-            cnn_module_norm,
-            causal,
-        )
-        self.encoders = torch.nn.ModuleList(
-            [
-                ConformerEncoderLayer(
-                    output_size,
-                    COSYVOICE_ATTENTION_CLASSES[selfattention_layer_type](
-                        *encoder_selfattn_layer_args
-                    ),
-                    PositionwiseFeedForward(*positionwise_layer_args),
-                    (
-                        PositionwiseFeedForward(*positionwise_layer_args)
-                        if macaron_style
-                        else None
-                    ),
-                    (
-                        ConvolutionModule(*convolution_layer_args)
-                        if use_cnn_module
-                        else None
-                    ),
-                    dropout_rate,
-                    normalize_before,
-                )
-                for _ in range(num_blocks)
-            ]
-        )
-        # Both dicts are keyed by captured sequence length; filled by
-        # capture_inference and read by inference.
-        self.inference_buffers = {}
-        self.inference_graphs = {}
-    def _prepare_inference_inputs(self, xs: torch.Tensor, xs_lens: torch.Tensor):
-        """Build the tensors that `forward_layers` consumes.
-        Shared by `capture_inference` and `inference`, which both call
-        `add_optional_chunk_mask` with decoding_chunk_size=0 and
-        num_decoding_left_chunks=-1.
-        Returns:
-            Tuple of (embedded xs, pos_emb, masks, chunk_masks); `masks` is
-            the padding mask returned by `self.embed`.
-        """
-        decoding_chunk_size = 0
-        num_decoding_left_chunks = -1
-        T = xs.size(1)
-        masks = ~make_pad_mask(xs_lens, T).unsqueeze(1)  # (B, 1, T)
-        if self.global_cmvn is not None:
-            xs = self.global_cmvn(xs)
-        xs, pos_emb, masks = self.embed(xs, masks)
-        chunk_masks = add_optional_chunk_mask(
-            xs,
-            masks,
-            self.use_dynamic_chunk,
-            self.use_dynamic_left_chunk,
-            decoding_chunk_size,
-            self.static_chunk_size,
-            num_decoding_left_chunks,
-        )
-        return xs, pos_emb, masks, chunk_masks
-    @torch.inference_mode()
-    def capture_inference(self, seq_len_to_capture=None):
-        """Capture one CUDA graph of `forward_layers` per sequence length.
-        For every length a random bfloat16 input of batch size 1 is pushed
-        through the input pipeline, `forward_layers` is recorded into a
-        `torch.cuda.CUDAGraph`, and the graph plus the tensors it was
-        captured on are stored under that length.
-        Args:
-            seq_len_to_capture: iterable of sequence lengths to capture;
-                defaults to [128, 256, 512, 1024].
-        """
-        # None sentinel instead of a mutable list default
-        if seq_len_to_capture is None:
-            seq_len_to_capture = [128, 256, 512, 1024]
-        device = next(self.parameters()).device
-        start_time = time.time()
-        print(
-            f"Start capture_inference for ConformerEncoder, seq_len_to_capture: {seq_len_to_capture}"
-        )
-        for seq_len in seq_len_to_capture:
-            xs = torch.randn(
-                1, seq_len, self._output_size, device=device, dtype=torch.bfloat16
-            )
-            xs_lens = torch.tensor([seq_len], device=device, dtype=torch.int32)
-            xs, pos_emb, masks, chunk_masks = self._prepare_inference_inputs(
-                xs, xs_lens
-            )
-            mask_pad = masks  # (B, 1, T/subsample_rate)
-            g = torch.cuda.CUDAGraph()
-            with torch.cuda.graph(g):
-                out = self.forward_layers(xs, chunk_masks, pos_emb, mask_pad)
-            self.inference_graphs[seq_len] = g
-            # Keep the captured tensors: inference() copies fresh data into
-            # them before replaying the graph and reads "out" afterwards.
-            self.inference_buffers[seq_len] = {
-                "xs": xs,
-                "chunk_masks": chunk_masks,
-                "pos_emb": pos_emb,
-                "mask_pad": mask_pad,
-                "out": out,
-            }
-        end_time = time.time()
-        print(
-            f"Finish capture_inference for ConformerEncoder, time elapsed: {end_time - start_time}"
-        )
-    @torch.inference_mode()
-    def inference(self, xs: torch.Tensor, xs_lens: torch.Tensor):
-        """Encode `xs`, replaying a captured CUDA graph when one fits.
-        The smallest captured length >= xs.shape[1] is selected; `xs` is
-        zero-padded along dim 1 up to it, the graph is replayed, and the
-        output is sliced back to the original length. If no captured length
-        fits, `forward_layers` is run directly.
-        Args:
-            xs (torch.Tensor): input features; dim 1 is the sequence axis.
-            xs_lens (torch.Tensor): lengths passed to `make_pad_mask`.
-        Returns:
-            Tuple of (out, masks), where `masks` is the padding mask
-            returned by `self.embed`.
-        NOTE(review): graphs are captured with batch size 1 — presumably
-        callers pass a single utterance here; confirm. On the graph path
-        `masks` is built for the padded length while `out` is sliced back
-        to the original length — confirm callers expect that.
-        """
-        curr_seq_len = xs.shape[1]
-        # Smallest captured length that can hold the current sequence
-        target_len = next(
-            (n for n in sorted(self.inference_graphs) if n >= curr_seq_len),
-            None,
-        )
-        if target_len is not None:
-            xs = F.pad(xs, (0, 0, 0, target_len - curr_seq_len), "constant", 0)
-        xs, pos_emb, masks, chunk_masks = self._prepare_inference_inputs(
-            xs, xs_lens
-        )
-        mask_pad = masks  # (B, 1, T/subsample_rate)
-        if target_len is not None:
-            # Replay reads from the captured tensors, so fill them in place
-            buffer = self.inference_buffers[target_len]
-            buffer["xs"].copy_(xs)
-            buffer["chunk_masks"].copy_(chunk_masks)
-            buffer["pos_emb"].copy_(pos_emb)
-            buffer["mask_pad"].copy_(mask_pad)
-            self.inference_graphs[target_len].replay()
-            out = buffer["out"][:, :curr_seq_len, :]
-        else:
-            out = self.forward_layers(xs, chunk_masks, pos_emb, mask_pad)
-        if self.normalize_before:
-            out = self.after_norm(out)
-        return out, masks

cosyvoice/transformer/encoder_layer.py DELETED Viewed

@@ -1,237 +0,0 @@
-# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu)
-#               2022 Xingchen Song ([email protected])
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# Modified from ESPnet(https://github.com/espnet/espnet)
-"""Encoder self-attention layer definition."""
-from typing import Optional, Tuple
-import torch
-from torch import nn
-class TransformerEncoderLayer(nn.Module):
-    """One transformer encoder block: self-attention, then feed-forward.
-    Each of the two sub-blocks sits inside a residual connection with
-    dropout and has its own LayerNorm.
-    Args:
-        size (int): Input dimension.
-        self_attn (torch.nn.Module): Self-attention module instance.
-            `MultiHeadedAttention` or `RelPositionMultiHeadedAttention`
-            instance can be used as the argument.
-        feed_forward (torch.nn.Module): Feed-forward module instance.
-            `PositionwiseFeedForward` instance can be used as the argument.
-        dropout_rate (float): Dropout rate.
-        normalize_before (bool):
-            True: apply layer_norm before each sub-block.
-            False: apply layer_norm after each sub-block.
-    """
-    def __init__(
-        self,
-        size: int,
-        self_attn: torch.nn.Module,
-        feed_forward: torch.nn.Module,
-        dropout_rate: float,
-        normalize_before: bool = True,
-    ):
-        """Store the sub-modules; create the two LayerNorms and the dropout."""
-        super().__init__()
-        self.size = size
-        self.normalize_before = normalize_before
-        self.self_attn = self_attn
-        self.feed_forward = feed_forward
-        self.norm1 = nn.LayerNorm(size, eps=1e-5)
-        self.norm2 = nn.LayerNorm(size, eps=1e-5)
-        self.dropout = nn.Dropout(dropout_rate)
-    def forward(
-        self,
-        x: torch.Tensor,
-        mask: torch.Tensor,
-        pos_emb: torch.Tensor,
-        mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool),
-        att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)),
-        cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)),
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
-        """Compute encoded features.
-        Args:
-            x (torch.Tensor): (#batch, time, size)
-            mask (torch.Tensor): Mask tensor for the input (#batch, time, time),
-                (0, 0, 0) means fake mask.
-            pos_emb (torch.Tensor): passed through to `self_attn`; kept for
-                interface compatibility with ConformerEncoderLayer.
-            mask_pad (torch.Tensor): not used in the transformer layer,
-                present only for a unified API with the conformer layer.
-            att_cache (torch.Tensor): Cache tensor of the KEY & VALUE
-                (#batch=1, head, cache_t1, d_k * 2), head * d_k == size.
-            cnn_cache (torch.Tensor): Convolution cache in conformer layer
-                (#batch=1, size, cache_t2); not used here, present only for
-                interface compatibility with ConformerEncoderLayer.
-        Returns:
-            torch.Tensor: Output tensor (#batch, time, size).
-            torch.Tensor: Mask tensor (#batch, time, time), returned as given.
-            torch.Tensor: att_cache tensor,
-                (#batch=1, head, cache_t1 + time, d_k * 2).
-            torch.Tensor: placeholder cnn_cache tensor of shape (0, 0, 0).
-        """
-        pre_norm = self.normalize_before
-        # self-attention sub-block
-        attn_in = self.norm1(x) if pre_norm else x
-        attn_out, new_att_cache = self.self_attn(
-            attn_in, attn_in, attn_in, mask, pos_emb=pos_emb, cache=att_cache
-        )
-        x = x + self.dropout(attn_out)
-        if not pre_norm:
-            x = self.norm1(x)
-        # feed-forward sub-block
-        ffn_in = self.norm2(x) if pre_norm else x
-        x = x + self.dropout(self.feed_forward(ffn_in))
-        if not pre_norm:
-            x = self.norm2(x)
-        # No convolution here: hand back an empty cache to keep the 4-tuple
-        empty_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device)
-        return x, mask, new_att_cache, empty_cnn_cache
-class ConformerEncoderLayer(nn.Module):
-    """One conformer encoder block.
-    Sub-blocks run in this order: optional macaron feed-forward,
-    self-attention, optional convolution, feed-forward. Each sits inside a
-    residual connection with dropout.
-    Args:
-        size (int): Input dimension.
-        self_attn (torch.nn.Module): Self-attention module instance.
-            `MultiHeadedAttention` or `RelPositionMultiHeadedAttention`
-            instance can be used as the argument.
-        feed_forward (torch.nn.Module): Feed-forward module instance.
-            `PositionwiseFeedForward` instance can be used as the argument.
-            `forward` calls it unconditionally.
-        feed_forward_macaron (torch.nn.Module): Additional feed-forward module
-            instance, or None to skip the macaron sub-block.
-            `PositionwiseFeedForward` instance can be used as the argument.
-        conv_module (torch.nn.Module): Convolution module instance, or None
-            to skip the convolution sub-block.
-            `ConvolutionModule` instance can be used as the argument.
-        dropout_rate (float): Dropout rate.
-        normalize_before (bool):
-            True: apply layer_norm before each sub-block.
-            False: apply layer_norm after each sub-block.
-    """
-    def __init__(
-        self,
-        size: int,
-        self_attn: torch.nn.Module,
-        feed_forward: Optional[nn.Module] = None,
-        feed_forward_macaron: Optional[nn.Module] = None,
-        conv_module: Optional[nn.Module] = None,
-        dropout_rate: float = 0.1,
-        normalize_before: bool = True,
-    ):
-        """Store the sub-modules and create the LayerNorms they need."""
-        super().__init__()
-        self.size = size
-        self.normalize_before = normalize_before
-        self.self_attn = self_attn
-        self.feed_forward = feed_forward
-        self.feed_forward_macaron = feed_forward_macaron
-        self.conv_module = conv_module
-        self.norm_ff = nn.LayerNorm(size, eps=1e-5)  # for the FNN module
-        self.norm_mha = nn.LayerNorm(size, eps=1e-5)  # for the MHA module
-        has_macaron = feed_forward_macaron is not None
-        if has_macaron:
-            self.norm_ff_macaron = nn.LayerNorm(size, eps=1e-5)
-        # With two feed-forward modules each one contributes half a step
-        self.ff_scale = 0.5 if has_macaron else 1.0
-        if self.conv_module is not None:
-            self.norm_conv = nn.LayerNorm(size, eps=1e-5)  # for the CNN module
-            # for the final output of the block
-            self.norm_final = nn.LayerNorm(size, eps=1e-5)
-        self.dropout = nn.Dropout(dropout_rate)
-    def forward(
-        self,
-        x: torch.Tensor,
-        mask: torch.Tensor,
-        pos_emb: torch.Tensor,
-        mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool),
-        att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)),
-        cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)),
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
-        """Compute encoded features.
-        Args:
-            x (torch.Tensor): (#batch, time, size)
-            mask (torch.Tensor): Mask tensor for the input (#batch, time, time),
-                (0, 0, 0) means fake mask.
-            pos_emb (torch.Tensor): positional encoding, must not be None
-                for ConformerEncoderLayer.
-            mask_pad (torch.Tensor): batch padding mask used for conv module.
-                (#batch, 1, time), (0, 0, 0) means fake mask.
-            att_cache (torch.Tensor): Cache tensor of the KEY & VALUE
-                (#batch=1, head, cache_t1, d_k * 2), head * d_k == size.
-            cnn_cache (torch.Tensor): Convolution cache in conformer layer
-                (#batch=1, size, cache_t2)
-        Returns:
-            torch.Tensor: Output tensor (#batch, time, size).
-            torch.Tensor: Mask tensor (#batch, time, time), returned as given.
-            torch.Tensor: att_cache tensor,
-                (#batch=1, head, cache_t1 + time, d_k * 2).
-            torch.Tensor: cnn_cache tensor (#batch, size, cache_t2); an empty
-                (0, 0, 0) tensor when there is no conv module.
-        """
-        pre_norm = self.normalize_before
-        # macaron feed-forward sub-block (optional)
-        if self.feed_forward_macaron is not None:
-            macaron_in = self.norm_ff_macaron(x) if pre_norm else x
-            x = x + self.ff_scale * self.dropout(
-                self.feed_forward_macaron(macaron_in)
-            )
-            if not pre_norm:
-                x = self.norm_ff_macaron(x)
-        # multi-headed self-attention sub-block
-        attn_in = self.norm_mha(x) if pre_norm else x
-        attn_out, new_att_cache = self.self_attn(
-            attn_in, attn_in, attn_in, mask, pos_emb, att_cache
-        )
-        x = x + self.dropout(attn_out)
-        if not pre_norm:
-            x = self.norm_mha(x)
-        # convolution sub-block (optional); the empty cache below is the
-        # value returned when there is no conv module to produce a real one
-        new_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device)
-        if self.conv_module is not None:
-            conv_in = self.norm_conv(x) if pre_norm else x
-            conv_out, new_cnn_cache = self.conv_module(conv_in, mask_pad, cnn_cache)
-            x = x + self.dropout(conv_out)
-            if not pre_norm:
-                x = self.norm_conv(x)
-        # feed-forward sub-block
-        ffn_in = self.norm_ff(x) if pre_norm else x
-        x = x + self.ff_scale * self.dropout(self.feed_forward(ffn_in))
-        if not pre_norm:
-            x = self.norm_ff(x)
-        # norm_final only exists when a conv module was supplied
-        if self.conv_module is not None:
-            x = self.norm_final(x)
-        return x, mask, new_att_cache, new_cnn_cache

cosyvoice/transformer/label_smoothing_loss.py DELETED Viewed

@@ -1,98 +0,0 @@
-# Copyright (c) 2019 Shigeki Karita
-#               2020 Mobvoi Inc (Binbin Zhang)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Label smoothing module."""
-import torch
-from torch import nn
-class LabelSmoothingLoss(nn.Module):
-    """Label-smoothing loss.
-    A standard CE loss uses one-hot targets:
-    [0,1,2] ->
-    [
-        [1.0, 0.0, 0.0],
-        [0.0, 1.0, 0.0],
-        [0.0, 0.0, 1.0],
-    ]
-    With smoothing, part of the probability mass is taken from the true
-    label (1.0) and divided evenly among the other labels,
-    e.g. with smoothing=0.1:
-    [0,1,2] ->
-    [
-        [0.9, 0.05, 0.05],
-        [0.05, 0.9, 0.05],
-        [0.05, 0.05, 0.9],
-    ]
-    Args:
-        size (int): the number of class
-        padding_idx (int): padding class id which will be ignored for loss
-        smoothing (float): smoothing rate (0.0 means the conventional CE)
-        normalize_length (bool):
-            normalize loss by the number of non-padding targets if True
-            normalize loss by batch size if False
-    """
-    def __init__(
-        self,
-        size: int,
-        padding_idx: int,
-        smoothing: float,
-        normalize_length: bool = False,
-    ):
-        """Construct a LabelSmoothingLoss object."""
-        super().__init__()
-        self.size = size
-        self.padding_idx = padding_idx
-        self.smoothing = smoothing
-        self.confidence = 1.0 - smoothing
-        self.normalize_length = normalize_length
-        # Element-wise KL terms; masking and reduction happen in forward()
-        self.criterion = nn.KLDivLoss(reduction="none")
-    def forward(self, x: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
-        """Compute loss between x and target.
-        Predictions and labels are flattened to (batch*seqlen, class) and
-        (batch*seqlen,), and positions whose label equals the padding id
-        are masked out of the sum.
-        Args:
-            x (torch.Tensor): prediction (batch, seqlen, class)
-            target (torch.Tensor):
-                target signal masked with self.padding_idx (batch, seqlen)
-        Returns:
-            loss (torch.Tensor) : The KL loss, scalar float value
-        """
-        assert x.size(2) == self.size
-        batch_size = x.size(0)
-        logits = x.view(-1, self.size)
-        labels = target.view(-1)
-        is_pad = labels == self.padding_idx  # (batch*seqlen,)
-        num_valid = len(labels) - is_pad.sum().item()
-        # Built from zeros_like rather than under torch.no_grad(), since
-        # no_grad() can not be exported by JIT
-        true_dist = torch.zeros_like(logits).fill_(self.smoothing / (self.size - 1))
-        # Padded rows get a dummy class 0 so scatter_ never sees the padding
-        # id as an index; those rows are zeroed out of the loss below
-        scatter_index = labels.masked_fill(is_pad, 0).unsqueeze(1)
-        true_dist.scatter_(1, scatter_index, self.confidence)
-        kl = self.criterion(torch.log_softmax(logits, dim=1), true_dist)
-        summed = kl.masked_fill(is_pad.unsqueeze(1), 0).sum()
-        if self.normalize_length:
-            return summed / num_valid
-        return summed / batch_size

cosyvoice/transformer/positionwise_feed_forward.py DELETED Viewed

@@ -1,116 +0,0 @@
-# Copyright (c) 2019 Shigeki Karita
-#               2020 Mobvoi Inc (Binbin Zhang)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Positionwise feed forward layer definition."""
-import torch
-class PositionwiseFeedForward(torch.nn.Module):
-    """Positionwise feed forward layer.
-    The same two-layer network is applied at every position of the
-    sequence. The output dim is the same as the input dim.
-    Args:
-        idim (int): Input dimension.
-        hidden_units (int): The number of hidden units.
-        dropout_rate (float): Dropout rate.
-        activation (torch.nn.Module): Activation function
-    """
-    def __init__(
-        self,
-        idim: int,
-        hidden_units: int,
-        dropout_rate: float,
-        activation: torch.nn.Module = torch.nn.ReLU(),
-    ):
-        """Construct a PositionwiseFeedForward object."""
-        super().__init__()
-        self.w_1 = torch.nn.Linear(idim, hidden_units)
-        self.activation = activation
-        self.dropout = torch.nn.Dropout(dropout_rate)
-        self.w_2 = torch.nn.Linear(hidden_units, idim)
-    def forward(self, xs: torch.Tensor) -> torch.Tensor:
-        """Forward function.
-        Args:
-            xs: input tensor (B, L, D)
-        Returns:
-            output tensor, (B, L, D)
-        """
-        # expand -> activate -> dropout -> project back to idim
-        hidden = self.w_1(xs)
-        hidden = self.activation(hidden)
-        hidden = self.dropout(hidden)
-        return self.w_2(hidden)
-class MoEFFNLayer(torch.nn.Module):
-    """
-    Mixture of expert with Positionwise feed forward layer
-    See also figure 1 in https://arxiv.org/pdf/2305.15663.pdf
-    The output dim is same with the input dim.
-    Modified from https://github.com/Lightning-AI/lit-gpt/pull/823
-                  https://github.com/mistralai/mistral-src/blob/b46d6/moe_one_file_ref.py#L203-L219
-    Args:
-        n_expert: number of expert.
-        n_expert_per_token: The actual number of experts used for each frame
-        idim (int): Input dimension.
-        hidden_units (int): The number of hidden units.
-        dropout_rate (float): Dropout rate.
-        activation (torch.nn.Module): Activation function; the same
-            instance is handed to every expert.
-    """
-    def __init__(
-        self,
-        n_expert: int,
-        n_expert_per_token: int,
-        idim: int,
-        hidden_units: int,
-        dropout_rate: float,
-        activation: torch.nn.Module = torch.nn.ReLU(),
-    ):
-        """Create the bias-free router and `n_expert` feed-forward experts."""
-        super(MoEFFNLayer, self).__init__()
-        self.gate = torch.nn.Linear(idim, n_expert, bias=False)
-        self.experts = torch.nn.ModuleList(
-            PositionwiseFeedForward(idim, hidden_units, dropout_rate, activation)
-            for _ in range(n_expert)
-        )
-        self.n_expert_per_token = n_expert_per_token
-    def forward(self, xs: torch.Tensor) -> torch.Tensor:
-        """Forward function.
-        Each frame is routed to its top `n_expert_per_token` experts and the
-        expert outputs are summed, weighted by a softmax over the selected
-        router scores.
-        Args:
-            xs: input tensor (B, L, D)
-        Returns:
-            output tensor, (B, L, D)
-        """
-        B, L, D = xs.size()  # batch size, sequence length, embedding dimension (idim)
-        # reshape rather than view: view raises on non-contiguous input
-        # (e.g. the result of a transpose), reshape copies when it must
-        xs = xs.reshape(-1, D)  # (B*L, D)
-        router = self.gate(xs)  # (B*L, n_expert)
-        logits, indices = torch.topk(
-            router, self.n_expert_per_token
-        )  # logits: (B*L, n_expert_per_token), indices: (B*L, n_expert_per_token)
-        # softmax in float32, then cast back to the input dtype
-        weights = torch.nn.functional.softmax(logits, dim=1, dtype=torch.float).to(
-            dtype=xs.dtype
-        )  # (B*L, n_expert_per_token)
-        output = torch.zeros_like(xs)  # (B*L, D)
-        for i, expert in enumerate(self.experts):
-            # frames that selected expert i, and the top-k slot it occupies
-            mask = indices == i
-            batch_idx, ith_expert = torch.where(mask)
-            output[batch_idx] += weights[batch_idx, ith_expert, None] * expert(
-                xs[batch_idx]
-            )
-        return output.view(B, L, D)

cosyvoice/transformer/subsampling.py DELETED Viewed

@@ -1,391 +0,0 @@
-# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu)
-#               2024 Alibaba Inc (Xiang Lyu)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# Modified from ESPnet(https://github.com/espnet/espnet)
-"""Subsampling layer definition."""
-from typing import Tuple, Union
-import torch
-class BaseSubsampling(torch.nn.Module):
-    def __init__(self):
-        super().__init__()
-        self.right_context = 0
-        self.subsampling_rate = 1
-    def position_encoding(
-        self, offset: Union[int, torch.Tensor], size: int
-    ) -> torch.Tensor:
-        return self.pos_enc.position_encoding(offset, size)
-class EmbedinigNoSubsampling(BaseSubsampling):
-    """Embedding input without subsampling"""
-    def __init__(
-        self, idim: int, odim: int, dropout_rate: float, pos_enc_class: torch.nn.Module
-    ):
-        super().__init__()
-        self.embed = torch.nn.Embedding(idim, odim)
-        self.pos_enc = pos_enc_class
-    def forward(
-        self,
-        x: torch.Tensor,
-        x_mask: torch.Tensor,
-        offset: Union[int, torch.Tensor] = 0,
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        """Input x.
-        Args:
-            x (torch.Tensor): Input tensor (#batch, time, idim).
-            x_mask (torch.Tensor): Input mask (#batch, 1, time).
-        Returns:
-            torch.Tensor: linear input tensor (#batch, time', odim),
-                where time' = time .
-            torch.Tensor: linear input mask (#batch, 1, time'),
-                where time' = time .
-        """
-        x = self.embed(x)
-        x, pos_emb = self.pos_enc(x, offset)
-        return x, pos_emb, x_mask
-class LinearNoSubsampling(BaseSubsampling):
-    """Linear transform the input without subsampling
-    Args:
-        idim (int): Input dimension.
-        odim (int): Output dimension.
-        dropout_rate (float): Dropout rate.
-    """
-    def __init__(
-        self, idim: int, odim: int, dropout_rate: float, pos_enc_class: torch.nn.Module
-    ):
-        """Construct an linear object."""
-        super().__init__()
-        self.out = torch.nn.Sequential(
-            torch.nn.Linear(idim, odim),
-            torch.nn.LayerNorm(odim, eps=1e-5),
-            torch.nn.Dropout(dropout_rate),
-        )
-        self.pos_enc = pos_enc_class
-        self.right_context = 0
-        self.subsampling_rate = 1
-    def forward(
-        self,
-        x: torch.Tensor,
-        x_mask: torch.Tensor,
-        offset: Union[int, torch.Tensor] = 0,
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        """Input x.
-        Args:
-            x (torch.Tensor): Input tensor (#batch, time, idim).
-            x_mask (torch.Tensor): Input mask (#batch, 1, time).
-        Returns:
-            torch.Tensor: linear input tensor (#batch, time', odim),
-                where time' = time .
-            torch.Tensor: linear input mask (#batch, 1, time'),
-                where time' = time .
-        """
-        x = self.out(x)
-        x, pos_emb = self.pos_enc(x, offset)
-        return x, pos_emb, x_mask
-class Conv1dSubsampling2(BaseSubsampling):
-    """Convolutional 1D subsampling (to 1/2 length).
-       It is designed for Whisper, ref:
-       https://github.com/openai/whisper/blob/main/whisper/model.py
-    Args:
-        idim (int): Input dimension.
-        odim (int): Output dimension.
-        dropout_rate (float): Dropout rate.
-    """
-    def __init__(
-        self, idim: int, odim: int, dropout_rate: float, pos_enc_class: torch.nn.Module
-    ):
-        """Construct an Conv1dSubsampling2 object."""
-        super().__init__()
-        self.conv = torch.nn.Sequential(
-            torch.nn.Conv1d(idim, odim, kernel_size=3, padding=1),
-            torch.nn.GELU(),
-            torch.nn.Conv1d(odim, odim, kernel_size=3, stride=2, padding=1),
-            torch.nn.GELU(),
-        )
-        self.pos_enc = pos_enc_class
-        # The right context for every conv layer is computed by:
-        # (kernel_size - 1) * frame_rate_of_this_layer
-        self.subsampling_rate = 2
-        # 4 = (3 - 1) * 1 + (3 - 1) * 1
-        self.right_context = 4
-    def forward(
-        self,
-        x: torch.Tensor,
-        x_mask: torch.Tensor,
-        offset: Union[int, torch.Tensor] = 0,
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        """Subsample x.
-        Args:
-            x (torch.Tensor): Input tensor (#batch, time, idim).
-            x_mask (torch.Tensor): Input mask (#batch, 1, time).
-        Returns:
-            torch.Tensor: Subsampled tensor (#batch, time', odim),
-                where time' = time // 2.
-            torch.Tensor: Subsampled mask (#batch, 1, time'),
-                where time' = time // 2.
-            torch.Tensor: positional encoding
-        """
-        time = x.size(1)
-        x = x.transpose(1, 2)  # (b, f, t)
-        x = self.conv(x)
-        x = x.transpose(1, 2)  # (b, t, f)
-        x, pos_emb = self.pos_enc(x, offset)
-        return x, pos_emb, x_mask[:, :, (time + 1) % 2 :: 2]
-class Conv2dSubsampling4(BaseSubsampling):
-    """Convolutional 2D subsampling (to 1/4 length).
-    Args:
-        idim (int): Input dimension.
-        odim (int): Output dimension.
-        dropout_rate (float): Dropout rate.
-    """
-    def __init__(
-        self, idim: int, odim: int, dropout_rate: float, pos_enc_class: torch.nn.Module
-    ):
-        """Construct an Conv2dSubsampling4 object."""
-        super().__init__()
-        self.conv = torch.nn.Sequential(
-            torch.nn.Conv2d(1, odim, 3, 2),
-            torch.nn.ReLU(),
-            torch.nn.Conv2d(odim, odim, 3, 2),
-            torch.nn.ReLU(),
-        )
-        self.out = torch.nn.Sequential(
-            torch.nn.Linear(odim * (((idim - 1) // 2 - 1) // 2), odim)
-        )
-        self.pos_enc = pos_enc_class
-        # The right context for every conv layer is computed by:
-        # (kernel_size - 1) * frame_rate_of_this_layer
-        self.subsampling_rate = 4
-        # 6 = (3 - 1) * 1 + (3 - 1) * 2
-        self.right_context = 6
-    def forward(
-        self,
-        x: torch.Tensor,
-        x_mask: torch.Tensor,
-        offset: Union[int, torch.Tensor] = 0,
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        """Subsample x.
-        Args:
-            x (torch.Tensor): Input tensor (#batch, time, idim).
-            x_mask (torch.Tensor): Input mask (#batch, 1, time).
-        Returns:
-            torch.Tensor: Subsampled tensor (#batch, time', odim),
-                where time' = time // 4.
-            torch.Tensor: Subsampled mask (#batch, 1, time'),
-                where time' = time // 4.
-            torch.Tensor: positional encoding
-        """
-        x = x.unsqueeze(1)  # (b, c=1, t, f)
-        x = self.conv(x)
-        b, c, t, f = x.size()
-        x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f))
-        x, pos_emb = self.pos_enc(x, offset)
-        return x, pos_emb, x_mask[:, :, 2::2][:, :, 2::2]
-class Conv2dSubsampling6(BaseSubsampling):
-    """Convolutional 2D subsampling (to 1/6 length).
-    Args:
-        idim (int): Input dimension.
-        odim (int): Output dimension.
-        dropout_rate (float): Dropout rate.
-        pos_enc (torch.nn.Module): Custom position encoding layer.
-    """
-    def __init__(
-        self, idim: int, odim: int, dropout_rate: float, pos_enc_class: torch.nn.Module
-    ):
-        """Construct an Conv2dSubsampling6 object."""
-        super().__init__()
-        self.conv = torch.nn.Sequential(
-            torch.nn.Conv2d(1, odim, 3, 2),
-            torch.nn.ReLU(),
-            torch.nn.Conv2d(odim, odim, 5, 3),
-            torch.nn.ReLU(),
-        )
-        self.linear = torch.nn.Linear(odim * (((idim - 1) // 2 - 2) // 3), odim)
-        self.pos_enc = pos_enc_class
-        # 10 = (3 - 1) * 1 + (5 - 1) * 2
-        self.subsampling_rate = 6
-        self.right_context = 10
-    def forward(
-        self,
-        x: torch.Tensor,
-        x_mask: torch.Tensor,
-        offset: Union[int, torch.Tensor] = 0,
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        """Subsample x.
-        Args:
-            x (torch.Tensor): Input tensor (#batch, time, idim).
-            x_mask (torch.Tensor): Input mask (#batch, 1, time).
-        Returns:
-            torch.Tensor: Subsampled tensor (#batch, time', odim),
-                where time' = time // 6.
-            torch.Tensor: Subsampled mask (#batch, 1, time'),
-                where time' = time // 6.
-            torch.Tensor: positional encoding
-        """
-        x = x.unsqueeze(1)  # (b, c, t, f)
-        x = self.conv(x)
-        b, c, t, f = x.size()
-        x = self.linear(x.transpose(1, 2).contiguous().view(b, t, c * f))
-        x, pos_emb = self.pos_enc(x, offset)
-        return x, pos_emb, x_mask[:, :, 2::2][:, :, 4::3]
-class Conv2dSubsampling8(BaseSubsampling):
-    """Convolutional 2D subsampling (to 1/8 length).
-    Args:
-        idim (int): Input dimension.
-        odim (int): Output dimension.
-        dropout_rate (float): Dropout rate.
-    """
-    def __init__(
-        self, idim: int, odim: int, dropout_rate: float, pos_enc_class: torch.nn.Module
-    ):
-        """Construct an Conv2dSubsampling8 object."""
-        super().__init__()
-        self.conv = torch.nn.Sequential(
-            torch.nn.Conv2d(1, odim, 3, 2),
-            torch.nn.ReLU(),
-            torch.nn.Conv2d(odim, odim, 3, 2),
-            torch.nn.ReLU(),
-            torch.nn.Conv2d(odim, odim, 3, 2),
-            torch.nn.ReLU(),
-        )
-        self.linear = torch.nn.Linear(
-            odim * ((((idim - 1) // 2 - 1) // 2 - 1) // 2), odim
-        )
-        self.pos_enc = pos_enc_class
-        self.subsampling_rate = 8
-        # 14 = (3 - 1) * 1 + (3 - 1) * 2 + (3 - 1) * 4
-        self.right_context = 14
-    def forward(
-        self,
-        x: torch.Tensor,
-        x_mask: torch.Tensor,
-        offset: Union[int, torch.Tensor] = 0,
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        """Subsample x.
-        Args:
-            x (torch.Tensor): Input tensor (#batch, time, idim).
-            x_mask (torch.Tensor): Input mask (#batch, 1, time).
-        Returns:
-            torch.Tensor: Subsampled tensor (#batch, time', odim),
-                where time' = time // 8.
-            torch.Tensor: Subsampled mask (#batch, 1, time'),
-                where time' = time // 8.
-            torch.Tensor: positional encoding
-        """
-        x = x.unsqueeze(1)  # (b, c, t, f)
-        x = self.conv(x)
-        b, c, t, f = x.size()
-        x = self.linear(x.transpose(1, 2).contiguous().view(b, t, c * f))
-        x, pos_emb = self.pos_enc(x, offset)
-        return x, pos_emb, x_mask[:, :, 2::2][:, :, 2::2][:, :, 2::2]
-class LegacyLinearNoSubsampling(BaseSubsampling):
-    """Linear transform the input without subsampling
-    Args:
-        idim (int): Input dimension.
-        odim (int): Output dimension.
-        dropout_rate (float): Dropout rate.
-    """
-    def __init__(
-        self, idim: int, odim: int, dropout_rate: float, pos_enc_class: torch.nn.Module
-    ):
-        """Construct an linear object."""
-        super().__init__()
-        self.out = torch.nn.Sequential(
-            torch.nn.Linear(idim, odim),
-            torch.nn.LayerNorm(odim, eps=1e-5),
-            torch.nn.Dropout(dropout_rate),
-            torch.nn.ReLU(),
-        )
-        self.pos_enc = pos_enc_class
-        self.right_context = 0
-        self.subsampling_rate = 1
-    def forward(
-        self,
-        x: torch.Tensor,
-        x_mask: torch.Tensor,
-        offset: Union[int, torch.Tensor] = 0,
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        """Input x.
-        Args:
-            x (torch.Tensor): Input tensor (#batch, time, idim).
-            x_mask (torch.Tensor): Input mask (#batch, 1, time).
-        Returns:
-            torch.Tensor: linear input tensor (#batch, time', odim),
-                where time' = time .
-            torch.Tensor: linear input mask (#batch, 1, time'),
-                where time' = time .
-        """
-        x = self.out(x)
-        x, pos_emb = self.pos_enc(x, offset)
-        return x, pos_emb, x_mask

cosyvoice/utils/__init__.py DELETED Viewed

File without changes

cosyvoice/utils/audio.py DELETED Viewed

@@ -1,90 +0,0 @@
-import numpy as np
-import torch
-import torch.utils.data
-from librosa.filters import mel as librosa_mel_fn
-from scipy.io.wavfile import read
-MAX_WAV_VALUE = 32768.0
-def load_wav(full_path):
-    sampling_rate, data = read(full_path)
-    return data, sampling_rate
-def dynamic_range_compression(x, C=1, clip_val=1e-5):
-    return np.log(np.clip(x, a_min=clip_val, a_max=None) * C)
-def dynamic_range_decompression(x, C=1):
-    return np.exp(x) / C
-def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
-    return torch.log(torch.clamp(x, min=clip_val) * C)
-def dynamic_range_decompression_torch(x, C=1):
-    return torch.exp(x) / C
-def spectral_normalize_torch(magnitudes):
-    output = dynamic_range_compression_torch(magnitudes)
-    return output
-def spectral_de_normalize_torch(magnitudes):
-    output = dynamic_range_decompression_torch(magnitudes)
-    return output
-mel_basis = {}
-hann_window = {}
-def mel_spectrogram(
-    y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False
-):
-    # if torch.min(y) < -1.0:
-    #     print("min value is ", torch.min(y))
-    # if torch.max(y) > 1.0:
-    #     print("max value is ", torch.max(y))
-    global mel_basis, hann_window  # pylint: disable=global-statement
-    if f"{str(fmax)}_{str(y.device)}" not in mel_basis:
-        mel = librosa_mel_fn(
-            sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax
-        )
-        mel_basis[str(fmax) + "_" + str(y.device)] = (
-            torch.from_numpy(mel).float().to(y.device)
-        )
-        hann_window[str(y.device)] = torch.hann_window(win_size).to(y.device)
-    y = torch.nn.functional.pad(
-        y.unsqueeze(1),
-        (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)),
-        mode="reflect",
-    )
-    y = y.squeeze(1)
-    spec = torch.view_as_real(
-        torch.stft(
-            y,
-            n_fft,
-            hop_length=hop_size,
-            win_length=win_size,
-            window=hann_window[str(y.device)],
-            center=center,
-            pad_mode="reflect",
-            normalized=False,
-            onesided=True,
-            return_complex=True,
-        )
-    )
-    spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9))
-    spec = torch.matmul(mel_basis[str(fmax) + "_" + str(y.device)], spec)
-    spec = spectral_normalize_torch(spec)
-    return spec

cosyvoice/utils/class_utils.py DELETED Viewed

@@ -1,78 +0,0 @@
-# Copyright [2023-11-28] <[email protected], Xingchen Song>
-#            2024 Alibaba Inc (authors: Xiang Lyu)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-from cosyvoice.transformer.activation import Swish
-from cosyvoice.transformer.subsampling import (
-    LinearNoSubsampling,
-    EmbedinigNoSubsampling,
-    Conv1dSubsampling2,
-    Conv2dSubsampling4,
-    Conv2dSubsampling6,
-    Conv2dSubsampling8,
-)
-from cosyvoice.transformer.embedding import (
-    PositionalEncoding,
-    RelPositionalEncoding,
-    WhisperPositionalEncoding,
-    LearnablePositionalEncoding,
-    NoPositionalEncoding,
-)
-from cosyvoice.transformer.attention import (
-    MultiHeadedAttention,
-    RelPositionMultiHeadedAttention,
-)
-from cosyvoice.transformer.embedding import (
-    EspnetRelPositionalEncoding,
-)
-from cosyvoice.transformer.subsampling import (
-    LegacyLinearNoSubsampling,
-)
-# Lookup table from string key to activation class. "swish" resolves to
-# torch.nn.SiLU when torch provides it and to the local Swish otherwise.
-COSYVOICE_ACTIVATION_CLASSES = {
-    "hardtanh": torch.nn.Hardtanh,
-    "tanh": torch.nn.Tanh,
-    "relu": torch.nn.ReLU,
-    "selu": torch.nn.SELU,
-    "swish": getattr(torch.nn, "SiLU", Swish),
-    "gelu": torch.nn.GELU,
-}
-# Lookup table from string key to subsampling / input-layer class.
-# "paraformer_dummy" maps to torch.nn.Identity.
-COSYVOICE_SUBSAMPLE_CLASSES = {
-    "linear": LinearNoSubsampling,
-    "linear_legacy": LegacyLinearNoSubsampling,
-    "embed": EmbedinigNoSubsampling,
-    "conv1d2": Conv1dSubsampling2,
-    "conv2d": Conv2dSubsampling4,
-    "conv2d6": Conv2dSubsampling6,
-    "conv2d8": Conv2dSubsampling8,
-    "paraformer_dummy": torch.nn.Identity,
-}
-# Lookup table from string key to positional-encoding class. "embed" and
-# "abs_pos" both map to PositionalEncoding.
-COSYVOICE_EMB_CLASSES = {
-    "embed": PositionalEncoding,
-    "abs_pos": PositionalEncoding,
-    "rel_pos": RelPositionalEncoding,
-    "rel_pos_espnet": EspnetRelPositionalEncoding,
-    "no_pos": NoPositionalEncoding,
-    "abs_pos_whisper": WhisperPositionalEncoding,
-    "embed_learnable_pe": LearnablePositionalEncoding,
-}
-# Lookup table from string key to self-attention class.
-COSYVOICE_ATTENTION_CLASSES = {
-    "selfattn": MultiHeadedAttention,
-    "rel_selfattn": RelPositionMultiHeadedAttention,
-}

cosyvoice/utils/common.py DELETED Viewed

@@ -1,169 +0,0 @@
-# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang)
-#               2024 Alibaba Inc (authors: Xiang Lyu)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# Modified from ESPnet(https://github.com/espnet/espnet)
-"""Utility functions for Transformer."""
-import random
-from typing import List
-import numpy as np
-import torch
-IGNORE_ID = -1
-def pad_list(xs: List[torch.Tensor], pad_value: int):
-    """Perform padding for the list of tensors.
-    Args:
-        xs (List): List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)].
-        pad_value (float): Value for padding.
-    Returns:
-        Tensor: Padded tensor (B, Tmax, `*`).
-    Examples:
-        >>> x = [torch.ones(4), torch.ones(2), torch.ones(1)]
-        >>> x
-        [tensor([1., 1., 1., 1.]), tensor([1., 1.]), tensor([1.])]
-        >>> pad_list(x, 0)
-        tensor([[1., 1., 1., 1.],
-                [1., 1., 0., 0.],
-                [1., 0., 0., 0.]])
-    """
-    max_len = max([len(item) for item in xs])
-    batchs = len(xs)
-    ndim = xs[0].ndim
-    if ndim == 1:
-        pad_res = torch.zeros(batchs, max_len, dtype=xs[0].dtype, device=xs[0].device)
-    elif ndim == 2:
-        pad_res = torch.zeros(
-            batchs, max_len, xs[0].shape[1], dtype=xs[0].dtype, device=xs[0].device
-        )
-    elif ndim == 3:
-        pad_res = torch.zeros(
-            batchs,
-            max_len,
-            xs[0].shape[1],
-            xs[0].shape[2],
-            dtype=xs[0].dtype,
-            device=xs[0].device,
-        )
-    else:
-        raise ValueError(f"Unsupported ndim: {ndim}")
-    pad_res.fill_(pad_value)
-    for i in range(batchs):
-        pad_res[i, : len(xs[i])] = xs[i]
-    return pad_res
-def th_accuracy(
-    pad_outputs: torch.Tensor, pad_targets: torch.Tensor, ignore_label: int
-) -> torch.Tensor:
-    """Calculate accuracy.
-    Args:
-        pad_outputs (Tensor): Prediction tensors (B * Lmax, D).
-        pad_targets (LongTensor): Target label tensors (B, Lmax).
-        ignore_label (int): Ignore label id.
-    Returns:
-        torch.Tensor: Accuracy value (0.0 - 1.0).
-    """
-    pad_pred = pad_outputs.view(
-        pad_targets.size(0), pad_targets.size(1), pad_outputs.size(1)
-    ).argmax(2)
-    mask = pad_targets != ignore_label
-    numerator = torch.sum(
-        pad_pred.masked_select(mask) == pad_targets.masked_select(mask)
-    )
-    denominator = torch.sum(mask)
-    return (numerator / denominator).detach()
-def get_padding(kernel_size, dilation=1):
-    return int((kernel_size * dilation - dilation) / 2)
-def init_weights(m, mean=0.0, std=0.01):
-    classname = m.__class__.__name__
-    if classname.find("Conv") != -1:
-        m.weight.data.normal_(mean, std)
-# Repetition Aware Sampling in VALL-E 2
-def ras_sampling(
-    weighted_scores,
-    decoded_tokens,
-    sampling,
-    top_p=0.8,
-    top_k=25,
-    win_size=10,
-    tau_r=0.1,
-):
-    top_ids = nucleus_sampling(weighted_scores, top_p=top_p, top_k=top_k)
-    rep_num = (
-        (torch.tensor(decoded_tokens[-win_size:]).to(weighted_scores.device) == top_ids)
-        .sum()
-        .item()
-    )
-    if rep_num >= win_size * tau_r:
-        top_ids = random_sampling(weighted_scores, decoded_tokens, sampling)
-    return top_ids
-def nucleus_sampling(weighted_scores, top_p=0.8, top_k=25):
-    prob, indices = [], []
-    cum_prob = 0.0
-    sorted_value, sorted_idx = weighted_scores.softmax(dim=0).sort(
-        descending=True, stable=True
-    )
-    for i in range(len(sorted_idx)):
-        # sampling both top-p and numbers.
-        if cum_prob < top_p and len(prob) < top_k:
-            cum_prob += sorted_value[i]
-            prob.append(sorted_value[i])
-            indices.append(sorted_idx[i])
-        else:
-            break
-    prob = torch.tensor(prob).to(weighted_scores)
-    indices = torch.tensor(indices, dtype=torch.long).to(weighted_scores.device)
-    top_ids = indices[prob.multinomial(1, replacement=True)]
-    return top_ids
-def random_sampling(weighted_scores, decoded_tokens, sampling):
-    top_ids = weighted_scores.softmax(dim=0).multinomial(1, replacement=True)
-    return top_ids
-def fade_in_out(fade_in_mel, fade_out_mel, window):
-    device = fade_in_mel.device
-    fade_in_mel, fade_out_mel = fade_in_mel.cpu(), fade_out_mel.cpu()
-    mel_overlap_len = int(window.shape[0] / 2)
-    fade_in_mel[..., :mel_overlap_len] = (
-        fade_in_mel[..., :mel_overlap_len] * window[:mel_overlap_len]
-        + fade_out_mel[..., -mel_overlap_len:] * window[mel_overlap_len:]
-    )
-    return fade_in_mel.to(device)
-def set_all_random_seed(seed):
-    random.seed(seed)
-    np.random.seed(seed)
-    torch.manual_seed(seed)
-    torch.cuda.manual_seed_all(seed)

cosyvoice/utils/executor.py DELETED Viewed

@@ -1,151 +0,0 @@
-# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang)
-#               2024 Alibaba Inc (authors: Xiang Lyu)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import logging
-from contextlib import nullcontext
-import os
-import torch
-import torch.distributed as dist
-from cosyvoice.utils.train_utils import (
-    update_parameter_and_lr,
-    log_per_step,
-    log_per_save,
-    batch_forward,
-    batch_backward,
-    save_model,
-    cosyvoice_join,
-)
-class Executor:
-    """Runs training epochs and cross-validation passes, tracking step and epoch."""
-    def __init__(self):
-        # Counters; train_one_epoc advances step once per accum_grad batches.
-        self.step = 0
-        self.epoch = 0
-        # Process rank from the RANK environment variable (0 when unset).
-        self.rank = int(os.environ.get("RANK", 0))
-        # NOTE(review): the rank is used directly as the CUDA device index --
-        # confirm this is intended for multi-node runs.
-        self.device = torch.device("cuda:{}".format(self.rank))
-    def train_one_epoc(
-        self,
-        model,
-        optimizer,
-        scheduler,
-        train_data_loader,
-        cv_data_loader,
-        writer,
-        info_dict,
-        group_join,
-    ):
-        """Train for one pass over ``train_data_loader``, then cross-validate.
-        ``info_dict`` is read for "accum_grad", "train_engine" and
-        "save_per_step", and is updated in place with "tag", "step", "epoch"
-        and "batch_idx" for every batch before being passed to the train_utils
-        helpers. A mid-epoch ``cv`` pass runs every ``save_per_step`` steps
-        when that value is positive; a final ``cv`` pass always runs after the
-        loop.
-        """
-        lr = optimizer.param_groups[0]["lr"]
-        logging.info(
-            "Epoch {} TRAIN info lr {} rank {}".format(self.epoch, lr, self.rank)
-        )
-        logging.info(
-            "using accumulate grad, new batch size is {} times"
-            " larger than before".format(info_dict["accum_grad"])
-        )
-        # A context manager to be used in conjunction with an instance of
-        # torch.nn.parallel.DistributedDataParallel to be able to train
-        # with uneven inputs across participating processes.
-        model.train()
-        model_context = (
-            model.join if info_dict["train_engine"] == "torch_ddp" else nullcontext
-        )
-        with model_context():
-            for batch_idx, batch_dict in enumerate(train_data_loader):
-                info_dict["tag"] = "TRAIN"
-                info_dict["step"] = self.step
-                info_dict["epoch"] = self.epoch
-                info_dict["batch_idx"] = batch_idx
-                # Leave the loop as soon as cosyvoice_join returns a true value.
-                if cosyvoice_join(group_join, info_dict):
-                    break
-                # Disable gradient synchronizations across DDP processes.
-                # Within this context, gradients will be accumulated on module
-                # variables, which will later be synchronized.
-                if (
-                    info_dict["train_engine"] == "torch_ddp"
-                    and (batch_idx + 1) % info_dict["accum_grad"] != 0
-                ):
-                    context = model.no_sync
-                # Used for single gpu training and DDP gradient synchronization
-                # processes.
-                else:
-                    context = nullcontext
-                with context():
-                    info_dict = batch_forward(model, batch_dict, info_dict)
-                    info_dict = batch_backward(model, info_dict)
-                info_dict = update_parameter_and_lr(
-                    model, optimizer, scheduler, info_dict
-                )
-                log_per_step(writer, info_dict)
-                # NOTE specify save_per_step in cosyvoice.yaml if you want to enable step save
-                if (
-                    info_dict["save_per_step"] > 0
-                    and (self.step + 1) % info_dict["save_per_step"] == 0
-                    and (batch_idx + 1) % info_dict["accum_grad"] == 0
-                ):
-                    dist.barrier()
-                    self.cv(
-                        model, cv_data_loader, writer, info_dict, on_batch_end=False
-                    )
-                    # cv() switched the model to eval mode; resume training mode.
-                    model.train()
-                # A step is counted only when a full accumulation window ends.
-                if (batch_idx + 1) % info_dict["accum_grad"] == 0:
-                    self.step += 1
-        dist.barrier()
-        self.cv(model, cv_data_loader, writer, info_dict, on_batch_end=True)
-    @torch.inference_mode()
-    def cv(self, model, cv_data_loader, writer, info_dict, on_batch_end=True):
-        """Run cross validation over ``cv_data_loader`` and save the model.
-        Each loss in ``info_dict["loss_dict"]`` is weighted by the number of
-        utterances in its batch and averaged over all utterances. The averages
-        replace ``info_dict["loss_dict"]`` before ``log_per_save`` and
-        ``save_model`` are called. The saved model is named
-        "epoch_{epoch}_whole" when ``on_batch_end`` is true and
-        "epoch_{epoch}_step_{step + 1}" otherwise.
-        """
-        logging.info(
-            "Epoch {} Step {} on_batch_end {} CV rank {}".format(
-                self.epoch, self.step + 1, on_batch_end, self.rank
-            )
-        )
-        model.eval()
-        total_num_utts, total_loss_dict = 0, {}  # avoid division by 0
-        for batch_idx, batch_dict in enumerate(cv_data_loader):
-            info_dict["tag"] = "CV"
-            info_dict["step"] = self.step
-            info_dict["epoch"] = self.epoch
-            info_dict["batch_idx"] = batch_idx
-            num_utts = len(batch_dict["utts"])
-            total_num_utts += num_utts
-            info_dict = batch_forward(model, batch_dict, info_dict)
-            # Accumulate every loss weighted by the batch's utterance count.
-            for k, v in info_dict["loss_dict"].items():
-                if k not in total_loss_dict:
-                    total_loss_dict[k] = []
-                total_loss_dict[k].append(v.item() * num_utts)
-            log_per_step(None, info_dict)
-        # Turn the weighted sums into per-utterance averages.
-        for k, v in total_loss_dict.items():
-            total_loss_dict[k] = sum(v) / total_num_utts
-        info_dict["loss_dict"] = total_loss_dict
-        log_per_save(writer, info_dict)
-        model_name = (
-            "epoch_{}_whole".format(self.epoch)
-            if on_batch_end
-            else "epoch_{}_step_{}".format(self.epoch, self.step + 1)
-        )
-        save_model(model, model_name, info_dict)

cosyvoice/utils/file_utils.py DELETED Viewed

@@ -1,49 +0,0 @@
-# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang)
-#               2024 Alibaba Inc (authors: Xiang Lyu)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import json
-import torchaudio
-import logging
-logging.getLogger("matplotlib").setLevel(logging.WARNING)
-logging.basicConfig(level=logging.DEBUG, format="%(asctime)s %(levelname)s %(message)s")
-def read_lists(list_file):
-    lists = []
-    with open(list_file, "r", encoding="utf8") as fin:
-        for line in fin:
-            lists.append(line.strip())
-    return lists
-def read_json_lists(list_file):
-    lists = read_lists(list_file)
-    results = {}
-    for fn in lists:
-        with open(fn, "r", encoding="utf8") as fin:
-            results.update(json.load(fin))
-    return results
-def load_wav(wav, target_sr):
-    speech, sample_rate = torchaudio.load(wav)
-    speech = speech.mean(dim=0, keepdim=True)
-    if sample_rate != target_sr:
-        # assert sample_rate > target_sr, 'wav sample rate {} must be greater than {}'.format(sample_rate, target_sr)
-        speech = torchaudio.transforms.Resample(
-            orig_freq=sample_rate, new_freq=target_sr
-        )(speech)
-    return speech

cosyvoice/utils/frontend_utils.py DELETED Viewed

@@ -1,142 +0,0 @@
-# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import re
-# One or more consecutive characters in the range U+4E00..U+9FFF.
-chinese_char_pattern = re.compile(r"[\u4e00-\u9fff]+")
-def contains_chinese(text):
-    """Return True when ``text`` has at least one character in U+4E00..U+9FFF."""
-    return chinese_char_pattern.search(text) is not None
-# replace special symbol
-def replace_corner_mark(text):
-    """Replace the superscripts "²" and "³" with the words "平方" and "立方"."""
-    return text.replace("²", "平方").replace("³", "立方")
-# remove meaningless symbol
-def remove_bracket(text):
-    """Delete bracket and backtick characters and turn "——" into a space.
-    The full-width brackets "（", "）", "【", "】" and the backtick are removed
-    first; afterwards every "——" pair is replaced by a single space.
-    """
-    for symbol in ("（", "）", "【", "】", "`"):
-        text = text.replace(symbol, "")
-    return text.replace("——", " ")
-# spell Arabic numerals
-def spell_out_number(text: str, inflect_parser):
-    """Replace every run of digit characters in ``text`` with its spelled-out form.
-    Args:
-        text: input string.
-        inflect_parser: object whose ``number_to_words(str)`` method returns the
-            replacement text for one digit run.
-    Returns:
-        ``text`` with each maximal run of ``str.isdigit`` characters replaced;
-        all other characters are copied unchanged.
-    """
-    pieces = []
-    pos, total = 0, len(text)
-    while pos < total:
-        if not text[pos].isdigit():
-            pieces.append(text[pos])
-            pos += 1
-            continue
-        # Extend the run to the last consecutive digit, then convert it once.
-        end = pos
-        while end < total and text[end].isdigit():
-            end += 1
-        pieces.append(inflect_parser.number_to_words(text[pos:end]))
-        pos = end
-    return "".join(pieces)
-# split paragraph logic:
-# 1. each sentence has max length token_max_n and min length token_min_n; merge the last sentence if its length is less than merge_len
-# 2. calculate sentence length according to lang
-# 3. split sentences according to punctuation
-def split_paragraph(
-    text: str,
-    tokenize,
-    lang="zh",
-    token_max_n=80,
-    token_min_n=60,
-    merge_len=20,
-    comma_split=False,
-):
-    """Split ``text`` at punctuation and regroup the sentences by length.
-    The text is cut after every punctuation mark; a closing quote directly
-    after the mark is kept with the sentence it follows.  Consecutive sentences
-    are then concatenated, and a group is emitted once adding the next sentence
-    would exceed ``token_max_n`` while the group is already longer than
-    ``token_min_n``.  A final group shorter than ``merge_len`` is appended to
-    the previous group instead of being emitted on its own.
-    Args:
-        text: paragraph to split; an empty string yields an empty list.
-        tokenize: callable returning a sized token sequence; only used when
-            ``lang`` is not "zh".
-        lang: "zh" measures length in characters, anything else in tokens.
-        token_max_n: upper length bound used when closing a group.
-        token_min_n: a group is only closed once it is longer than this.
-        merge_len: final groups shorter than this are merged backwards.
-        comma_split: also split at "，" and ",".
-    Returns:
-        List of utterance strings.
-    """
-    # ``text[-1]`` below would raise IndexError on an empty string.
-    if not text:
-        return []
-    def calc_utt_length(_text: str):
-        return len(_text) if lang == "zh" else len(tokenize(_text))
-    def should_merge(_text: str):
-        return calc_utt_length(_text) < merge_len
-    if lang == "zh":
-        pounc = ["。", "？", "！", "；", "：", "、", ".", "?", "!", ";"]
-    else:
-        pounc = [".", "?", "!", ";", ":"]
-    if comma_split:
-        pounc.extend(["，", ","])
-    # Terminate the last sentence so the scan below emits it.
-    if text[-1] not in pounc:
-        text += "。" if lang == "zh" else "."
-    st = 0
-    utts = []
-    for i, c in enumerate(text):
-        if c not in pounc:
-            continue
-        if len(text[st:i]) > 0:
-            utts.append(text[st:i] + c)
-        # Attach a following quote to the latest sentence.  When no sentence
-        # has been collected yet there is nothing to attach it to (popping
-        # would raise IndexError), so the quote starts the next sentence.
-        if i + 1 < len(text) and text[i + 1] in ['"', "”"] and utts:
-            utts[-1] += text[i + 1]
-            st = i + 2
-        else:
-            st = i + 1
-    final_utts = []
-    cur_utt = ""
-    for utt in utts:
-        if (
-            calc_utt_length(cur_utt + utt) > token_max_n
-            and calc_utt_length(cur_utt) > token_min_n
-        ):
-            final_utts.append(cur_utt)
-            cur_utt = ""
-        cur_utt = cur_utt + utt
-    if len(cur_utt) > 0:
-        if should_merge(cur_utt) and len(final_utts) != 0:
-            final_utts[-1] = final_utts[-1] + cur_utt
-        else:
-            final_utts.append(cur_utt)
-    return final_utts
-# remove blank between chinese character
-def replace_blank(text: str):
-    """Remove spaces except those separating two ASCII, non-space characters.
-    A space is kept only when both its left and right neighbours exist, are
-    ASCII and are not spaces themselves; every other space (next to a
-    non-ASCII character, next to another space, or at either end of ``text``)
-    is dropped.
-    """
-    out_str = []
-    last = len(text) - 1
-    for i, c in enumerate(text):
-        if c != " ":
-            out_str.append(c)
-            continue
-        # A space at either end lacks one neighbour.  Indexing it anyway would
-        # raise IndexError (trailing space) or wrap round to the last
-        # character (leading space), so such spaces are simply dropped.
-        if i == 0 or i == last:
-            continue
-        prev_c, next_c = text[i - 1], text[i + 1]
-        if prev_c.isascii() and prev_c != " " and next_c.isascii() and next_c != " ":
-            out_str.append(c)
-    return "".join(out_str)

cosyvoice/utils/mask.py DELETED Viewed

@@ -1,226 +0,0 @@
-# Copyright (c) 2019 Shigeki Karita
-#               2020 Mobvoi Inc (Binbin Zhang)
-#               2024 Alibaba Inc (authors: Xiang Lyu)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-# The block below is an inert string literal, not executable code: it holds an
-# earlier torch.tril-based version of subsequent_mask and is never called.
-'''
-def subsequent_mask(
-        size: int,
-        device: torch.device = torch.device("cpu"),
-) -> torch.Tensor:
-    """Create mask for subsequent steps (size, size).
-    This mask is used only in decoder which works in an auto-regressive mode.
-    This means the current step could only do attention with its left steps.
-    In encoder, fully attention is used when streaming is not necessary and
-    the sequence is not long. In this  case, no attention mask is needed.
-    When streaming is need, chunk-based attention is used in encoder. See
-    subsequent_chunk_mask for the chunk-based attention mask.
-    Args:
-        size (int): size of mask
-        str device (str): "cpu" or "cuda" or torch.Tensor.device
-        dtype (torch.device): result dtype
-    Returns:
-        torch.Tensor: mask
-    Examples:
-        >>> subsequent_mask(3)
-        [[1, 0, 0],
-         [1, 1, 0],
-         [1, 1, 1]]
-    """
-    ret = torch.ones(size, size, device=device, dtype=torch.bool)
-    return torch.tril(ret)
-'''
-def subsequent_mask(
-    size: int,
-    device: torch.device = torch.device("cpu"),
-) -> torch.Tensor:
-    """Build the (size, size) lower-triangular boolean mask.
-    Entry ``[i, j]`` is True exactly when ``j <= i``, so step ``i`` can only
-    attend to itself and to the steps on its left.  The mask is meant for the
-    auto-regressive decoder; for chunk-based streaming attention in the encoder
-    see subsequent_chunk_mask.
-    Args:
-        size (int): side length of the square mask
-        device (torch.device): device on which the mask is created
-    Returns:
-        torch.Tensor: boolean mask of shape (size, size)
-    Examples:
-        >>> subsequent_mask(3)
-        [[1, 0, 0],
-         [1, 1, 0],
-         [1, 1, 1]]
-    """
-    positions = torch.arange(size, device=device)
-    # Column index j on every row, compared against the row index i.
-    return positions.expand(size, size) <= positions.unsqueeze(-1)
-def subsequent_chunk_mask(
-    size: int,
-    chunk_size: int,
-    num_left_chunks: int = -1,
-    device: torch.device = torch.device("cpu"),
-) -> torch.Tensor:
-    """Create mask for subsequent steps (size, size) with chunk size,
-       this is for streaming encoder
-    Row ``i`` is True from the start of its visible window up to the end of
-    the chunk that contains ``i``.  The window starts at column 0 when
-    ``num_left_chunks`` is negative, otherwise ``num_left_chunks`` chunks
-    before the chunk of ``i`` (clamped at column 0).
-    Args:
-        size (int): size of mask
-        chunk_size (int): size of chunk
-        num_left_chunks (int): number of left chunks
-            <0: use full chunk
-            >=0: use num_left_chunks
-        device (torch.device): "cpu" or "cuda" or torch.Tensor.device
-    Returns:
-        torch.Tensor: boolean mask of shape (size, size)
-    Examples:
-        >>> subsequent_chunk_mask(4, 2)
-        [[1, 1, 0, 0],
-         [1, 1, 0, 0],
-         [1, 1, 1, 1],
-         [1, 1, 1, 1]]
-    """
-    # Vectorised replacement for a per-row Python loop over range(size).
-    positions = torch.arange(size, device=device)
-    # Chunk index of every row.
-    chunk_index = torch.div(positions, chunk_size, rounding_mode="floor")
-    # First column after the row's own chunk; columns never reach ``size``, so
-    # no explicit min(..., size) is needed.
-    ending = (chunk_index + 1) * chunk_size
-    columns = positions.unsqueeze(0)
-    ret = columns < ending.unsqueeze(1)
-    if num_left_chunks >= 0:
-        start = ((chunk_index - num_left_chunks) * chunk_size).clamp(min=0)
-        ret = ret & (columns >= start.unsqueeze(1))
-    return ret
-def add_optional_chunk_mask(
-    xs: torch.Tensor,
-    masks: torch.Tensor,
-    use_dynamic_chunk: bool,
-    use_dynamic_left_chunk: bool,
-    decoding_chunk_size: int,
-    static_chunk_size: int,
-    num_decoding_left_chunks: int,
-    enable_full_context: bool = True,
-):
-    """Apply optional mask for encoder.
-    Args:
-        xs (torch.Tensor): padded input, (B, L, D), L for max length
-        masks (torch.Tensor): mask for xs, (B, 1, L)
-        use_dynamic_chunk (bool): whether to use dynamic chunk or not
-        use_dynamic_left_chunk (bool): whether to use dynamic left chunk for
-            training.
-        decoding_chunk_size (int): decoding chunk size for dynamic chunk, it's
-            0: default for training, use random dynamic chunk.
-            <0: for decoding, use full chunk.
-            >0: for decoding, use fixed chunk size as set.
-        static_chunk_size (int): chunk size for static chunk training/decoding
-            if it's greater than 0, if use_dynamic_chunk is true,
-            this parameter will be ignored
-        num_decoding_left_chunks: number of left chunks, this is for decoding,
-            the chunk size is decoding_chunk_size.
-            >=0: use num_decoding_left_chunks
-            <0: use all left chunks
-        enable_full_context (bool):
-            True: chunk size is either [1, 25] or full context(max_len)
-            False: chunk size ~ U[1, 25]
-    Returns:
-        torch.Tensor: chunk mask of the input xs; ``masks`` itself when neither
-            dynamic nor static chunking is enabled.
-    """
-    # Whether to use chunk mask or not
-    if use_dynamic_chunk:
-        max_len = xs.size(1)
-        if decoding_chunk_size < 0:
-            chunk_size = max_len
-            num_left_chunks = -1
-        elif decoding_chunk_size > 0:
-            chunk_size = decoding_chunk_size
-            num_left_chunks = num_decoding_left_chunks
-        else:
-            # chunk size is either [1, 25] or full context(max_len).
-            # Since we use 4 times subsampling and allow up to 1s(100 frames)
-            # delay, the maximum frame is 100 / 4 = 25.
-            # torch.randint needs low < high, so a length of at most 1 cannot be
-            # sampled from; fall back to the only sensible draw, 1.
-            if max_len > 1:
-                chunk_size = torch.randint(1, max_len, (1,)).item()
-            else:
-                chunk_size = 1
-            num_left_chunks = -1
-            if chunk_size > max_len // 2 and enable_full_context:
-                chunk_size = max_len
-            else:
-                chunk_size = chunk_size % 25 + 1
-                if use_dynamic_left_chunk:
-                    max_left_chunks = (max_len - 1) // chunk_size
-                    # Same low < high constraint: with no complete chunk to the
-                    # left there is nothing to sample, so use 0 left chunks.
-                    if max_left_chunks > 0:
-                        num_left_chunks = torch.randint(
-                            0, max_left_chunks, (1,)
-                        ).item()
-                    else:
-                        num_left_chunks = 0
-    elif static_chunk_size > 0:
-        chunk_size = static_chunk_size
-        num_left_chunks = num_decoding_left_chunks
-    else:
-        return masks
-    chunk_masks = subsequent_chunk_mask(
-        xs.size(1), chunk_size, num_left_chunks, xs.device
-    )  # (L, L)
-    chunk_masks = chunk_masks.unsqueeze(0)  # (1, L, L)
-    return masks & chunk_masks  # (B, L, L)
-def make_pad_mask(lengths: torch.Tensor, max_len: int = 0) -> torch.Tensor:
-    """Make a boolean mask that is True on the padded positions.
-    Args:
-        lengths (torch.Tensor): Batch of lengths (B,).
-        max_len (int): width of the mask; when not positive, the largest value
-            in ``lengths`` is used.
-    Returns:
-        torch.Tensor: mask of shape (B, max_len) whose entry ``[b, t]`` is True
-            when ``t >= lengths[b]``.
-    Examples:
-        >>> lengths = torch.tensor([5, 3, 2])
-        >>> make_pad_mask(lengths)
-        masks = [[0, 0, 0, 0 ,0],
-                 [0, 0, 0, 1, 1],
-                 [0, 0, 1, 1, 1]]
-    """
-    batch_size = lengths.size(0)
-    if max_len <= 0:
-        max_len = lengths.max().item()
-    positions = torch.arange(0, max_len, dtype=torch.int64, device=lengths.device)
-    grid = positions.unsqueeze(0).expand(batch_size, max_len)
-    # A position is padding once its index reaches the sequence length.
-    return grid >= lengths.unsqueeze(-1)

cosyvoice/utils/scheduler.py DELETED Viewed

@@ -1,761 +0,0 @@
-# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang)
-#               2022 Ximalaya Inc (Yuguang Yang)
-#               2024 Alibaba Inc (authors: Xiang Lyu)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# Modified from ESPnet(https://github.com/espnet/espnet)
-#               NeMo(https://github.com/NVIDIA/NeMo)
-from typing import Union
-import math
-import warnings
-import torch
-from torch.optim.lr_scheduler import _LRScheduler
-class WarmupLR(_LRScheduler):
-    """The WarmupLR scheduler
-    This scheduler is almost the same as the NoamLR scheduler; the only
-    difference is the constant factor:
-    NoamLR:
-        lr = optimizer.lr * model_size ** -0.5
-             * min(step ** -0.5, step * warmup_step ** -1.5)
-    WarmupLR:
-        lr = optimizer.lr * warmup_step ** 0.5
-             * min(step ** -0.5, step * warmup_step ** -1.5)
-    Note that the maximum lr equals to optimizer.lr in this scheduler.
-    With ``warmup_steps == 0`` the rate is ``optimizer.lr * step ** -0.5``.
-    """
-    def __init__(
-        self,
-        optimizer: torch.optim.Optimizer,
-        warmup_steps: Union[int, float] = 25000,
-        last_epoch: int = -1,
-    ):
-        # ``warmup_steps`` must be assigned before the base constructor runs,
-        # because step() is also invoked in __init__() and it calls get_lr().
-        self.warmup_steps = warmup_steps
-        super().__init__(optimizer, last_epoch)
-    def __repr__(self):
-        return f"{self.__class__.__name__}(warmup_steps={self.warmup_steps})"
-    def get_lr(self):
-        """Return the learning rate of every param group for step ``last_epoch + 1``."""
-        step_num = self.last_epoch + 1
-        if self.warmup_steps == 0:
-            return [lr * step_num**-0.5 for lr in self.base_lrs]
-        peak_scale = self.warmup_steps**0.5
-        # Linear ramp while step_num < warmup_steps, inverse square root after.
-        ramp = min(step_num**-0.5, step_num * self.warmup_steps**-1.5)
-        return [lr * peak_scale * ramp for lr in self.base_lrs]
-    def set_step(self, step: int):
-        """Overwrite the scheduler's step counter (``last_epoch``)."""
-        self.last_epoch = step
-class WarmupPolicy(_LRScheduler):
-    """Adds warmup kwargs and warmup logic to lr policy.
-    During warmup the rate grows linearly to the base rate; afterwards
-    ``_get_lr`` decides (constant base rate here, overridden by subclasses);
-    once ``max_steps`` is exceeded the rate is ``min_lr``.
-    All arguments should be passed as kwargs for clarity,
-    Args:
-        warmup_steps: Number of training steps in warmup stage
-        warmup_ratio: Ratio of warmup steps to total steps
-        max_steps: Total number of steps while training or `None` for
-            infinite training
-        min_lr: Learning rate used after ``max_steps``
-        last_epoch: Passed through to the base scheduler
-    """
-    def __init__(
-        self,
-        optimizer,
-        *,
-        warmup_steps=None,
-        warmup_ratio=None,
-        max_steps=None,
-        min_lr=0.0,
-        last_epoch=-1,
-    ):
-        assert not (
-            warmup_steps is not None and warmup_ratio is not None
-        ), "Either use particular number of step or ratio"
-        assert (
-            warmup_ratio is None or max_steps is not None
-        ), "If there is a ratio, there should be a total steps"
-        # It is necessary to assign all attributes *before* __init__,
-        # as class is wrapped by an inner class.
-        self.max_steps = max_steps
-        if warmup_steps is not None:
-            self.warmup_steps = warmup_steps
-        elif warmup_ratio is not None:
-            self.warmup_steps = int(warmup_ratio * max_steps)
-        else:
-            self.warmup_steps = 0
-        self.min_lr = min_lr
-        super().__init__(optimizer, last_epoch)
-    def get_lr(self):
-        """Return the learning rates for the current ``last_epoch``."""
-        if not self._get_lr_called_within_step:
-            warnings.warn(
-                "To get the last learning rate computed "
-                "by the scheduler, please use `get_last_lr()`.",
-                UserWarning,
-                stacklevel=2,
-            )
-        step = self.last_epoch
-        if step <= self.warmup_steps and self.warmup_steps > 0:
-            return self._get_warmup_lr(step)
-        # ``max_steps is None`` means infinite training: there is no point
-        # after which the rate drops to min_lr (and comparing an int with None
-        # would raise TypeError).
-        if self.max_steps is not None and step > self.max_steps:
-            return [self.min_lr for _ in self.base_lrs]
-        return self._get_lr(step)
-    def _get_warmup_lr(self, step):
-        """Linear warmup: fraction (step + 1) / (warmup_steps + 1) of each base lr."""
-        lr_val = (step + 1) / (self.warmup_steps + 1)
-        return [initial_lr * lr_val for initial_lr in self.base_lrs]
-    def _get_lr(self, step):
-        """Simple const lr policy"""
-        return self.base_lrs
-class SquareRootConstantPolicy(_LRScheduler):
-    """Holds a constant learning rate of 1 / sqrt(constant_steps) at first.
-    For steps up to ``constant_steps`` every param group gets
-    ``1 / constant_steps ** 0.5`` (independent of its base lr); afterwards
-    ``_get_lr`` decides (the base lr here), and once ``max_steps`` is exceeded
-    the rate is ``min_lr``.
-    All arguments should be passed as kwargs for clarity,
-    Args:
-        constant_steps: Number of training steps in the constant stage
-        constant_ratio: Ratio of constant steps to total steps
-        max_steps: Total number of steps while training or `None` for
-            infinite training
-        min_lr: Learning rate used after ``max_steps``
-        last_epoch: Passed through to the base scheduler
-    """
-    def __init__(
-        self,
-        optimizer,
-        *,
-        constant_steps=None,
-        constant_ratio=None,
-        max_steps=None,
-        min_lr=0.0,
-        last_epoch=-1,
-    ):
-        assert not (
-            constant_steps is not None and constant_ratio is not None
-        ), "Either use particular number of step or ratio"
-        assert (
-            constant_ratio is None or max_steps is not None
-        ), "If there is a ratio, there should be a total steps"
-        # It is necessary to assign all attributes *before* __init__,
-        # as class is wrapped by an inner class.
-        self.max_steps = max_steps
-        if constant_steps is not None:
-            self.constant_steps = constant_steps
-        elif constant_ratio is not None:
-            self.constant_steps = int(constant_ratio * max_steps)
-        else:
-            self.constant_steps = 0
-        # Derive the rate from the resolved step count: the raw argument is
-        # None when a ratio (or nothing) was given.  Without a constant stage
-        # there is no constant rate (1 / 0 ** 0.5 is undefined).
-        if self.constant_steps > 0:
-            self.constant_lr = 1 / (self.constant_steps**0.5)
-        else:
-            self.constant_lr = None
-        self.min_lr = min_lr
-        super().__init__(optimizer, last_epoch)
-    def get_lr(self):
-        """Return the learning rates for the current ``last_epoch``."""
-        if not self._get_lr_called_within_step:
-            warnings.warn(
-                "To get the last learning rate computed "
-                "by the scheduler, please use `get_last_lr()`.",
-                UserWarning,
-                stacklevel=2,
-            )
-        step = self.last_epoch
-        if self.constant_steps > 0 and step <= self.constant_steps:
-            return [self.constant_lr for _ in self.base_lrs]
-        # ``max_steps is None`` means infinite training: never fall to min_lr.
-        if self.max_steps is not None and step > self.max_steps:
-            return [self.min_lr for _ in self.base_lrs]
-        return self._get_lr(step)
-    def _get_lr(self, step):
-        """Simple const lr policy"""
-        return self.base_lrs
-class WarmupHoldPolicy(WarmupPolicy):
-    """Variant of WarmupPolicy which maintains high
-       learning rate for a defined number of steps.
-    All arguments should be passed as kwargs for clarity,
-    Args:
-        warmup_steps: Number of training steps in warmup stage
-        warmup_ratio: Ratio of warmup steps to total steps
-        hold_steps: Number of training steps to
-                    hold the learning rate after warm up
-        hold_ratio: Ratio of hold steps to total steps
-        max_steps: Total number of steps while training or `None` for
-            infinite training
-        min_lr: Learning rate used after ``max_steps``
-        last_epoch: Passed through to the base scheduler
-    """
-    def __init__(
-        self,
-        optimizer,
-        *,
-        warmup_steps=None,
-        warmup_ratio=None,
-        hold_steps=None,
-        hold_ratio=None,
-        max_steps=None,
-        min_lr=0.0,
-        last_epoch=-1,
-    ):
-        assert not (
-            hold_steps is not None and hold_ratio is not None
-        ), "Either use particular number of step or ratio"
-        assert (
-            hold_ratio is None or max_steps is not None
-        ), "If there is a ratio, there should be a total steps"
-        self.min_lr = min_lr
-        self._last_warmup_lr = 0.0
-        # Necessary to duplicate as class attributes are hidden in inner class
-        self.max_steps = max_steps
-        if warmup_steps is not None:
-            self.warmup_steps = warmup_steps
-        elif warmup_ratio is not None:
-            self.warmup_steps = int(warmup_ratio * max_steps)
-        else:
-            self.warmup_steps = 0
-        # ``self.hold_steps`` is stored as the absolute step at which the hold
-        # phase ends, i.e. it already includes the warmup steps.
-        if hold_steps is not None:
-            self.hold_steps = hold_steps + self.warmup_steps
-        elif hold_ratio is not None:
-            self.hold_steps = int(hold_ratio * max_steps) + self.warmup_steps
-        else:
-            self.hold_steps = 0
-        super().__init__(
-            optimizer,
-            warmup_steps=warmup_steps,
-            warmup_ratio=warmup_ratio,
-            max_steps=max_steps,
-            last_epoch=last_epoch,
-            min_lr=min_lr,
-        )
-    def get_lr(self):
-        """Return the learning rates for the current ``last_epoch``."""
-        if not self._get_lr_called_within_step:
-            warnings.warn(
-                "To get the last learning rate computed by the scheduler,"
-                " "
-                "please use `get_last_lr()`.",
-                UserWarning,
-                stacklevel=2,
-            )
-        step = self.last_epoch
-        # Warmup phase
-        if step <= self.warmup_steps and self.warmup_steps > 0:
-            return self._get_warmup_lr(step)
-        # Hold phase
-        if (step >= self.warmup_steps) and (step < self.hold_steps):
-            return self.base_lrs
-        # ``max_steps is None`` means infinite training: never fall to min_lr
-        # (and comparing an int with None would raise TypeError).
-        if self.max_steps is not None and step > self.max_steps:
-            return [self.min_lr for _ in self.base_lrs]
-        return self._get_lr(step)
-class WarmupAnnealHoldPolicy(_LRScheduler):
-    """Adds warmup kwargs and warmup logic to lr policy.
-    The schedule is warmup, then ``_get_lr`` (decay, overridden by
-    subclasses), then a constant tail of ``constant_steps`` steps ending at
-    ``max_steps``, then ``min_lr``.  Without ``max_steps`` the tail and the
-    final ``min_lr`` stage have no defined position and are never entered.
-    All arguments should be passed as kwargs for clarity,
-    Args:
-        warmup_steps: Number of training steps in warmup stage
-        warmup_ratio: Ratio of warmup steps to total steps
-        max_steps: Total number of steps while training or `None` for
-            infinite training
-        min_lr: Minimum lr to hold the learning rate after decay at.
-        constant_steps: Number of steps to keep lr constant at.
-        constant_ratio: Ratio of steps to keep lr constant.
-    """
-    def __init__(
-        self,
-        optimizer,
-        *,
-        warmup_steps=None,
-        warmup_ratio=None,
-        constant_steps=None,
-        constant_ratio=None,
-        max_steps=None,
-        min_lr=0.0,
-        last_epoch=-1,
-    ):
-        assert not (
-            warmup_steps is not None and warmup_ratio is not None
-        ), "Either use particular number of step or ratio"
-        assert not (
-            constant_steps is not None and constant_ratio is not None
-        ), "Either use constant_steps or constant_ratio"
-        assert (
-            warmup_ratio is None or max_steps is not None
-        ), "If there is a ratio, there should be a total steps"
-        # Same requirement for the constant ratio, which is also multiplied
-        # by max_steps below.
-        assert (
-            constant_ratio is None or max_steps is not None
-        ), "If there is a ratio, there should be a total steps"
-        # It is necessary to assign all attributes *before* __init__,
-        # as class is wrapped by an inner class.
-        self.max_steps = max_steps
-        if warmup_steps is not None:
-            self.warmup_steps = warmup_steps
-        elif warmup_ratio is not None:
-            self.warmup_steps = int(warmup_ratio * max_steps)
-        else:
-            self.warmup_steps = 0
-        if constant_steps is not None:
-            self.constant_steps = constant_steps
-        elif constant_ratio is not None:
-            self.constant_steps = int(constant_ratio * max_steps)
-        else:
-            self.constant_steps = 0
-        # The decay length is only defined for a bounded schedule.
-        if max_steps is None:
-            self.decay_steps = None
-        else:
-            self.decay_steps = max_steps - (self.constant_steps + self.warmup_steps)
-        self.min_lr = min_lr
-        super().__init__(optimizer, last_epoch)
-    def get_lr(self):
-        """Return the learning rates for the current ``last_epoch``."""
-        if not self._get_lr_called_within_step:
-            warnings.warn(
-                "To get the last learning rate computed "
-                "by the scheduler, please use `get_last_lr()`.",
-                UserWarning,
-                stacklevel=2,
-            )
-        step = self.last_epoch
-        # Warmup steps
-        if self.warmup_steps > 0 and step <= self.warmup_steps:
-            return self._get_warmup_lr(step)
-        if self.max_steps is not None:
-            # Constant steps after warmup and decay
-            if (
-                self.constant_steps > 0
-                and (self.warmup_steps + self.decay_steps) < step <= self.max_steps
-            ):
-                return self._get_constant_lr(step)
-            # Min lr after max steps of updates
-            if step > self.max_steps:
-                return [self.min_lr for _ in self.base_lrs]
-        return self._get_lr(step)
-    def _get_warmup_lr(self, step):
-        """Linear warmup: fraction (step + 1) / (warmup_steps + 1) of each base lr."""
-        lr_val = (step + 1) / (self.warmup_steps + 1)
-        return [initial_lr * lr_val for initial_lr in self.base_lrs]
-    def _get_constant_lr(self, step):
-        """Rate used during the constant tail: ``min_lr`` for every group."""
-        return [self.min_lr for _ in self.base_lrs]
-    def _get_lr(self, step):
-        """Simple const lr policy"""
-        return self.base_lrs
-def _squareroot_annealing(initial_lr, step, max_steps, min_lr):
-    """Decay ``initial_lr`` by sqrt((max_steps - step) / max_steps), floored at ``min_lr``."""
-    # Clamp the remaining fraction at 0: past max_steps it turns negative, and a
-    # negative float raised to 0.5 is a complex number that max() cannot order.
-    remaining = max(0.0, (max_steps - step) / max_steps)
-    mult = remaining**0.5
-    out_lr = initial_lr * mult
-    out_lr = max(out_lr, min_lr)
-    return out_lr
-def _square_annealing(initial_lr, step, max_steps, min_lr):
-    """Decay ``initial_lr`` by ((max_steps - step) / max_steps) ** 2, floored at ``min_lr``."""
-    remaining = (max_steps - step) / max_steps
-    return max(initial_lr * remaining**2, min_lr)
-def _cosine_annealing(initial_lr, step, max_steps, min_lr):
-    """Interpolate from ``initial_lr`` (step 0) to ``min_lr`` (step max_steps) along a half cosine."""
-    cosine_term = math.cos(math.pi * step / max_steps)
-    mult = 0.5 * (1 + cosine_term)
-    return (initial_lr - min_lr) * mult + min_lr
-def _linear_warmup_with_cosine_annealing(
-    max_lr, warmup_steps, step, decay_steps, min_lr
-):
-    """Linear warmup to ``max_lr``, cosine decay to ``min_lr``, then ``min_lr``.
-    Args:
-        max_lr: peak learning rate; must be greater than ``min_lr``.
-        warmup_steps: length of the linear warmup (0 disables it).
-        step: current step.
-        decay_steps: length of the cosine decay that follows the warmup.
-        min_lr: rate returned once ``step > warmup_steps + decay_steps``.
-    """
-    assert max_lr > min_lr
-    # Linear ramp from 0 to max_lr over the warmup.
-    in_warmup = warmup_steps > 0 and step <= warmup_steps
-    if in_warmup:
-        return max_lr * float(step) / float(warmup_steps)
-    # Past the end of the decay the rate stays at min_lr.
-    if step > warmup_steps + decay_steps:
-        return min_lr
-    # Fraction of the decay phase completed so far.
-    decay_ratio = float(step - warmup_steps) / float(decay_steps)
-    assert decay_ratio >= 0.0
-    assert decay_ratio <= 1.0
-    coeff = 0.5 * (math.cos(math.pi * decay_ratio) + 1.0)
-    return min_lr + coeff * (max_lr - min_lr)
-def _poly_decay(initial_lr, step, decay_steps, power, min_lr, cycle):
-    """Polynomial decay from ``initial_lr`` to ``min_lr`` with exponent ``power``.
-    Without ``cycle`` the step is capped at ``decay_steps``.  With ``cycle``
-    the decay horizon is stretched to the next multiple of ``decay_steps``
-    that is at or beyond ``step``.
-    """
-    if not cycle:
-        step = min(step, decay_steps)
-    else:
-        decay_steps *= 1.0 if step == 0 else math.ceil(step / decay_steps)
-    progress = step / decay_steps
-    return (initial_lr - min_lr) * math.pow(1.0 - progress, power) + min_lr
-def _noam_hold_annealing(
-    initial_lr, step, warmup_steps, hold_steps, decay_rate, min_lr
-):
-    """Scale ``initial_lr`` by warmup_steps ** decay_rate / (step - hold_steps) ** decay_rate.
-    Both powers are floored at 1 and the result is floored at ``min_lr``.
-    """
-    # hold_steps = total number of steps
-    # to hold the LR, not the warmup + hold steps.
-    warmup_factor = max(1, warmup_steps**decay_rate)
-    hold_factor = max(1, (step - hold_steps) ** decay_rate)
-    return max((initial_lr * warmup_factor) / hold_factor, min_lr)
-class SquareAnnealing(WarmupPolicy):
-    """WarmupPolicy whose post-warmup rate follows ``_square_annealing``."""
-    def __init__(self, optimizer, *, max_steps, min_lr=1e-5, last_epoch=-1, **kwargs):
-        # Any further keyword arguments are handed to WarmupPolicy unchanged.
-        super().__init__(
-            optimizer=optimizer,
-            max_steps=max_steps,
-            min_lr=min_lr,
-            last_epoch=last_epoch,
-            **kwargs,
-        )
-    def _get_lr(self, step):
-        """Square-anneal every base lr over the steps left after warmup."""
-        elapsed = step - self.warmup_steps
-        horizon = self.max_steps - self.warmup_steps
-        return [
-            _square_annealing(
-                initial_lr=base_lr,
-                step=elapsed,
-                max_steps=horizon,
-                min_lr=self.min_lr,
-            )
-            for base_lr in self.base_lrs
-        ]
-class SquareRootAnnealing(WarmupPolicy):
-    """WarmupPolicy whose post-warmup rate follows ``_squareroot_annealing``."""
-    def __init__(self, optimizer, *, max_steps, min_lr=0, last_epoch=-1, **kwargs):
-        # Any further keyword arguments are handed to WarmupPolicy unchanged.
-        super().__init__(
-            optimizer=optimizer,
-            max_steps=max_steps,
-            min_lr=min_lr,
-            last_epoch=last_epoch,
-            **kwargs,
-        )
-    def _get_lr(self, step):
-        """Square-root-anneal every base lr; step and max_steps are used as is."""
-        return [
-            _squareroot_annealing(
-                initial_lr=base_lr,
-                step=step,
-                max_steps=self.max_steps,
-                min_lr=self.min_lr,
-            )
-            for base_lr in self.base_lrs
-        ]
-class CosineAnnealing(WarmupAnnealHoldPolicy):
-    """Cosine decay on top of WarmupAnnealHoldPolicy.
-    Without constant steps: the parent's linear warmup followed by
-    ``_cosine_annealing`` over the steps left after warmup.  With constant
-    steps: warmup, decay and the constant tail are all computed by
-    ``_linear_warmup_with_cosine_annealing``.
-    """
-    def __init__(self, optimizer, *, max_steps, min_lr=0, last_epoch=-1, **kwargs):
-        # Any further keyword arguments are handed to the parent unchanged.
-        super().__init__(
-            optimizer=optimizer,
-            max_steps=max_steps,
-            last_epoch=last_epoch,
-            min_lr=min_lr,
-            **kwargs,
-        )
-    def _get_lr(self, step):
-        """Return the decayed rate of every param group.
-        Raises:
-            ValueError: if a base learning rate is below ``min_lr``.
-        """
-        for initial_lr in self.base_lrs:
-            if initial_lr < self.min_lr:
-                raise ValueError(
-                    f"{self} received an initial learning rate "
-                    f"that was lower than the minimum learning rate."
-                )
-        if self.constant_steps is None or self.constant_steps == 0:
-            new_lrs = [
-                _cosine_annealing(
-                    initial_lr=initial_lr,
-                    step=step - self.warmup_steps,
-                    max_steps=self.max_steps - self.warmup_steps,
-                    min_lr=self.min_lr,
-                )
-                for initial_lr in self.base_lrs
-            ]
-        else:
-            new_lrs = self._get_linear_warmup_with_cosine_annealing_lr(step)
-        return new_lrs
-    def _get_warmup_lr(self, step):
-        if self.constant_steps is None or self.constant_steps == 0:
-            return super()._get_warmup_lr(step)
-        else:
-            # Use linear warmup for the initial part.
-            return self._get_linear_warmup_with_cosine_annealing_lr(step)
-    def _get_constant_lr(self, step):
-        # Only called when `constant_steps` > 0.
-        return self._get_linear_warmup_with_cosine_annealing_lr(step)
-    def _get_linear_warmup_with_cosine_annealing_lr(self, step):
-        # Cosine Schedule for Megatron LM,
-        # slightly different warmup schedule + constant LR at the end.
-        # Each param group is scheduled from its own base lr, matching the
-        # per-group handling in _get_lr (this used to take base_lrs[0] for
-        # every group, which discarded per-group learning rates).
-        new_lrs = [
-            _linear_warmup_with_cosine_annealing(
-                max_lr=base_lr,
-                warmup_steps=self.warmup_steps,
-                step=step,
-                decay_steps=self.decay_steps,
-                min_lr=self.min_lr,
-            )
-            for base_lr in self.base_lrs
-        ]
-        return new_lrs
-class NoamAnnealing(_LRScheduler):
-    """Noam schedule: rate scaled by d_model ** -0.5 with warmup and inverse-sqrt decay.
-    For step ``s`` (at least 1) the multiplier applied to each base lr is
-    ``d_model ** -0.5 * min(s ** -0.5, s * warmup_steps ** -1.5)``, or
-    ``d_model ** -0.5 * s ** -0.5`` without warmup.  After the warmup the
-    result is floored at ``min_lr``.
-    All arguments should be passed as kwargs for clarity,
-    Args:
-        d_model: model size used for the normalisation factor
-        warmup_steps: Number of training steps in warmup stage
-        warmup_ratio: Ratio of warmup steps to total steps
-        max_steps: Total number of steps; only used to resolve ``warmup_ratio``
-        min_lr: Floor applied to the rate after warmup
-        last_epoch: Passed through to the base scheduler
-    """
-    def __init__(
-        self,
-        optimizer,
-        *,
-        d_model,
-        warmup_steps=None,
-        warmup_ratio=None,
-        max_steps=None,
-        min_lr=0.0,
-        last_epoch=-1,
-    ):
-        self._normalize = d_model ** (-0.5)
-        both_given = warmup_steps is not None and warmup_ratio is not None
-        assert not both_given, "Either use particular number of step or ratio"
-        ratio_has_total = warmup_ratio is None or max_steps is not None
-        assert ratio_has_total, "If there is a ratio, there should be a total steps"
-        # It is necessary to assign all attributes *before* __init__,
-        # as class is wrapped by an inner class.
-        self.max_steps = max_steps
-        if warmup_steps is not None:
-            self.warmup_steps = warmup_steps
-        elif warmup_ratio is not None:
-            self.warmup_steps = int(warmup_ratio * max_steps)
-        else:
-            self.warmup_steps = 0
-        self.min_lr = min_lr
-        super().__init__(optimizer, last_epoch)
-    def get_lr(self):
-        """Return the learning rates for the current ``last_epoch``.
-        Raises:
-            ValueError: if a base learning rate is below ``min_lr``.
-        """
-        if not self._get_lr_called_within_step:
-            warnings.warn(
-                "To get the last learning rate computed "
-                "by the scheduler, please use `get_last_lr()`.",
-                UserWarning,
-                stacklevel=2,
-            )
-        # Step 0 is treated as step 1 so that step ** -0.5 is defined.
-        step = max(1, self.last_epoch)
-        for base_lr in self.base_lrs:
-            if base_lr < self.min_lr:
-                raise ValueError(
-                    f"{self} received an initial learning rate "
-                    f"that was lower than the minimum learning rate."
-                )
-        return [
-            self._noam_annealing(initial_lr=base_lr, step=step)
-            for base_lr in self.base_lrs
-        ]
-    def _noam_annealing(self, initial_lr, step):
-        """Return the Noam-scaled rate of one param group at ``step``."""
-        if self.warmup_steps > 0:
-            shape = min(step ** (-0.5), step * (self.warmup_steps ** (-1.5)))
-        else:
-            shape = step ** (-0.5)
-        mult = self._normalize * shape
-        out_lr = initial_lr * mult
-        # The floor only applies once the warmup is over.
-        if step > self.warmup_steps:
-            out_lr = max(out_lr, self.min_lr)
-        return out_lr
class NoamHoldAnnealing(WarmupHoldPolicy):
    def __init__(
        self,
        optimizer,
        *,
        max_steps,
        decay_rate=0.5,
        min_lr=0.0,
        last_epoch=-1,
        **kwargs,
    ):
        """
        From NeMo: the Noam Hold Annealing policy of the Squeezeformer paper.

        In contrast to NoamAnnealing, the peak learning rate is set
        explicitly. The schedule warms up linearly, holds the peak LR, and
        then decays for the remaining steps, so the final LR still depends
        on the chosen hyper-parameters.

        The schedule has three stages:

        Warmup steps: linear warmup until the peak LR is reached. The peak
            LR is given directly rather than through a scaling factor.
        Hold steps: the peak LR is kept for a number of steps. A high peak
            LR lets the model converge faster when training is stable, but
            may also destabilise training. Usually a significant fraction
            of training (around 30-40% of all steps).
        Decay steps: the LR decays rapidly at a rate set by ``decay_rate``.
            Use 0.5 for Noam decay and 1.0 for the decay recommended by
            Squeezeformer. The fast decay after a long hold phase allows
            rapid convergence.

        References:
            - [Squeezeformer:
            An Efficient Transformer for Automatic Speech Recognition]
            (https://arxiv.org/abs/2206.00888)

        Args:
            optimizer: PyTorch-compatible optimizer object.
            warmup_steps: Number of training steps in the warmup stage
                (forwarded through ``**kwargs``).
            warmup_ratio: Ratio of warmup steps to total steps
                (forwarded through ``**kwargs``).
            hold_steps: Number of training steps to hold the learning rate
                after warmup (forwarded through ``**kwargs``).
            hold_ratio: Ratio of hold steps to total steps
                (forwarded through ``**kwargs``).
            max_steps: Total number of training steps, or ``None`` for
                infinite training.
            decay_rate: Exponent of the polynomial decay after the hold
                period. The default 0.5 corresponds to Noam decay.
            min_lr: Minimum learning rate.
        """
        # Must be set before the base __init__ is invoked.
        self.decay_rate = decay_rate
        super().__init__(
            optimizer=optimizer,
            max_steps=max_steps,
            last_epoch=last_epoch,
            min_lr=min_lr,
            **kwargs,
        )

    def _get_lr(self, step):
        """Return one learning rate per parameter group for ``step``.

        Raises:
            ValueError: If no warmup steps are configured.
        """
        if self.warmup_steps in (None, 0):
            raise ValueError("Noam scheduler cannot be used without warmup steps")

        # A positive self.hold_steps is reduced by the warmup length before
        # it is handed to the annealing helper.
        hold_length = self.hold_steps - self.warmup_steps if self.hold_steps > 0 else 0

        scheduled = []
        for base_lr in self.base_lrs:
            scheduled.append(
                _noam_hold_annealing(
                    base_lr,
                    step=step,
                    warmup_steps=self.warmup_steps,
                    hold_steps=hold_length,
                    decay_rate=self.decay_rate,
                    min_lr=self.min_lr,
                )
            )
        return scheduled

    def set_step(self, step: int):
        """Overwrite the scheduler's step counter (``last_epoch``)."""
        self.last_epoch = step
class ConstantLR(_LRScheduler):
    """Scheduler that leaves every learning rate at its base value."""

    def __init__(
        self,
        optimizer: torch.optim.Optimizer,
    ):
        """Attach the scheduler to ``optimizer``."""
        # The base __init__() has to run before any field is set, because it
        # already invokes step().
        super().__init__(optimizer)

    def get_lr(self):
        """Return the unchanged base learning rates."""
        unchanged = self.base_lrs
        return unchanged

    def set_step(self, step: int):
        """Overwrite the scheduler's step counter (``last_epoch``)."""
        self.last_epoch = step

cosyvoice/utils/train_utils.py DELETED Viewed

@@ -1,350 +0,0 @@
-# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang)
-#               2023 Horizon Inc. (authors: Xingchen Song)
-#               2024 Alibaba Inc (authors: Xiang Lyu)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from contextlib import nullcontext
-import logging
-import os
-import torch
-import json
-import re
-import datetime
-import yaml
-import deepspeed
-import torch.optim as optim
-import torch.distributed as dist
-from torch.utils.tensorboard import SummaryWriter
-from torch.utils.data import DataLoader
-from torch.nn.utils import clip_grad_norm_
-from deepspeed.runtime.zero.stage_1_and_2 import (
-    estimate_zero2_model_states_mem_needs_all_live,
-)
-from cosyvoice.dataset.dataset import Dataset
-from cosyvoice.utils.scheduler import (
-    WarmupLR,
-    NoamHoldAnnealing,
-    ConstantLR,
-)
def init_distributed(args):
    """Initialise the distributed backend for the selected training engine.

    World size and ranks are read from the ``WORLD_SIZE``, ``LOCAL_RANK`` and
    ``RANK`` environment variables (defaults 1, 0 and 0).

    Args:
        args: Namespace providing ``train_engine`` and ``dist_backend``.

    Returns:
        Tuple ``(world_size, local_rank, rank)``.
    """
    env = os.environ
    world_size = int(env.get("WORLD_SIZE", 1))
    local_rank = int(env.get("LOCAL_RANK", 0))
    rank = int(env.get("RANK", 0))

    logging.info(
        "training on multiple gpus, this gpu {}, rank {}, world_size {}".format(
            local_rank, rank, world_size
        )
    )

    if args.train_engine != "torch_ddp":
        deepspeed.init_distributed(dist_backend=args.dist_backend)
    else:
        # Native DDP: bind this process to its GPU before creating the group.
        torch.cuda.set_device(local_rank)
        dist.init_process_group(args.dist_backend)
    return world_size, local_rank, rank
def init_dataset_and_dataloader(args, configs):
    """Create the training and cross-validation datasets and their loaders.

    Args:
        args: Namespace providing ``train_data``, ``cv_data``, ``pin_memory``,
            ``num_workers`` and ``prefetch``.
        configs: Mapping containing the ``"data_pipeline"`` entry.

    Returns:
        Tuple ``(train_dataset, cv_dataset, train_data_loader, cv_data_loader)``.
    """

    def _make_loader(dataset):
        # Do not use persistent_workers=True: the whisper tokenizer opens the
        # tiktoken file each time the for loop starts.
        return DataLoader(
            dataset,
            batch_size=None,
            pin_memory=args.pin_memory,
            num_workers=args.num_workers,
            prefetch_factor=args.prefetch,
        )

    train_dataset = Dataset(
        args.train_data,
        data_pipeline=configs["data_pipeline"],
        mode="train",
        shuffle=True,
        partition=True,
    )
    # The CV dataset is also built with mode="train", but is neither shuffled
    # nor partitioned.
    cv_dataset = Dataset(
        args.cv_data,
        data_pipeline=configs["data_pipeline"],
        mode="train",
        shuffle=False,
        partition=False,
    )

    train_data_loader = _make_loader(train_dataset)
    cv_data_loader = _make_loader(cv_dataset)
    return train_dataset, cv_dataset, train_data_loader, cv_data_loader
def check_modify_and_save_config(args, configs):
    """Align ``configs["train_conf"]`` with the selected training engine.

    For ``torch_ddp`` only the dtype is forced to ``"fp32"``. Otherwise the
    DeepSpeed JSON config named by ``args.deepspeed_config`` is read and its
    dtype, gradient accumulation, gradient clipping and print interval
    override the corresponding training settings.

    Args:
        args: Namespace providing ``train_engine`` and, for DeepSpeed,
            ``deepspeed_config``.
        configs: Training configuration; modified in place.

    Returns:
        The same ``configs`` object.
    """
    if args.train_engine == "torch_ddp":
        configs["train_conf"]["dtype"] = "fp32"
        return configs

    with open(args.deepspeed_config, "r") as fin:
        ds_configs = json.load(fin)

    # The first enabled precision section wins; fp32 is the fallback.
    dtype = "fp32"
    for precision in ("fp16", "bf16"):
        if precision in ds_configs and ds_configs[precision]["enabled"]:
            dtype = precision
            break
    train_conf = configs["train_conf"]
    train_conf["dtype"] = dtype

    assert ds_configs["train_micro_batch_size_per_gpu"] == 1

    # With DeepSpeed, its settings override the DDP ones. save_per_step is
    # rescaled by the ratio of the old to the new accumulation count.
    ds_accum = ds_configs["gradient_accumulation_steps"]
    train_conf["save_per_step"] = int(
        train_conf["save_per_step"] * train_conf["accum_grad"] / ds_accum
    )
    train_conf["accum_grad"] = ds_configs["gradient_accumulation_steps"]
    train_conf["grad_clip"] = ds_configs["gradient_clipping"]
    train_conf["log_interval"] = ds_configs["steps_per_print"]
    return configs
def wrap_cuda_model(args, model):
    """Prepare ``model`` for the selected training engine.

    With ``torch_ddp`` the model is moved to CUDA and wrapped in
    ``DistributedDataParallel``. Otherwise the model is returned unchanged;
    rank 0 additionally runs the ZeRO-2 model-state memory estimate.

    Args:
        args: Namespace providing ``train_engine``.
        model: The model to wrap.

    Returns:
        The wrapped (DDP) or original model.
    """
    local_world_size = int(os.environ.get("LOCAL_WORLD_SIZE", 1))
    world_size = int(os.environ.get("WORLD_SIZE", 1))

    if args.train_engine == "torch_ddp":  # native pytorch ddp
        assert torch.cuda.is_available()
        model.cuda()
        return torch.nn.parallel.DistributedDataParallel(
            model, find_unused_parameters=True
        )

    is_rank_zero = int(os.environ.get("RANK", 0)) == 0
    if is_rank_zero:
        logging.info("Estimating model states memory needs (zero2)...")
        estimate_zero2_model_states_mem_needs_all_live(
            model,
            num_gpus_per_node=local_world_size,
            num_nodes=world_size // local_world_size,
        )
    return model
def init_optimizer_and_scheduler(args, configs, model):
    """Build the optimizer and LR scheduler described by ``configs``.

    Args:
        args: Namespace providing ``train_engine``; forwarded to
            ``deepspeed.initialize`` in DeepSpeed mode.
        configs: Mapping whose ``"train_conf"`` entry provides ``optim``
            ("adam" or "adamw"), ``optim_conf``, ``scheduler`` ("warmuplr",
            "NoamHoldAnnealing" or "constantlr") and ``scheduler_conf``.
        model: Model whose parameters are optimised.

    Returns:
        Tuple ``(model, optimizer, scheduler)``. In DeepSpeed mode all three
        are the objects returned by ``deepspeed.initialize``.

    Raises:
        ValueError: If the optimizer or scheduler name is not recognised.
    """
    train_conf = configs["train_conf"]

    optim_name = train_conf["optim"]
    if optim_name == "adam":
        optimizer = optim.Adam(model.parameters(), **train_conf["optim_conf"])
    elif optim_name == "adamw":
        optimizer = optim.AdamW(model.parameters(), **train_conf["optim_conf"])
    else:
        # The name is formatted into the message; concatenating the whole
        # train_conf dict to a str raised TypeError instead of ValueError.
        raise ValueError("unknown optimizer: {}".format(optim_name))

    scheduler_name = train_conf["scheduler"]
    if scheduler_name == "warmuplr":
        scheduler_type = WarmupLR
        scheduler_conf = train_conf["scheduler_conf"]
    elif scheduler_name == "NoamHoldAnnealing":
        scheduler_type = NoamHoldAnnealing
        scheduler_conf = train_conf["scheduler_conf"]
    elif scheduler_name == "constantlr":
        scheduler_type = ConstantLR
        # ConstantLR accepts no configuration, so none is passed in either
        # engine mode.
        scheduler_conf = {}
    else:
        raise ValueError("unknown scheduler: {}".format(scheduler_name))
    scheduler = scheduler_type(optimizer, **scheduler_conf)

    # use deepspeed optimizer for speedup
    if args.train_engine == "deepspeed":

        def build_scheduler(opt):
            # DeepSpeed calls this factory with the optimizer it creates.
            return scheduler_type(opt, **scheduler_conf)

        model, optimizer, _, scheduler = deepspeed.initialize(
            args=args,
            model=model,
            optimizer=None,
            lr_scheduler=build_scheduler,
            model_parameters=model.parameters(),
        )
    return model, optimizer, scheduler
def init_summarywriter(args):
    """Create the TensorBoard writer on rank 0 only.

    Args:
        args: Namespace providing ``model_dir`` and ``tensorboard_dir``.

    Returns:
        A ``SummaryWriter`` on rank 0 (after creating ``args.model_dir``),
        ``None`` on every other rank.
    """
    if int(os.environ.get("RANK", 0)) != 0:
        return None
    os.makedirs(args.model_dir, exist_ok=True)
    return SummaryWriter(args.tensorboard_dir)
def save_model(model, model_name, info_dict):
    """Save a checkpoint and, on rank 0, a YAML file describing it.

    With ``torch_ddp`` only rank 0 writes ``<model_dir>/<model_name>.pt``.
    With any other engine every rank calls ``model.save_checkpoint``.
    Rank 0 then stamps ``info_dict["save_time"]`` and dumps ``info_dict``
    next to the checkpoint as ``<model_name>.yaml``.

    Args:
        model: DDP-wrapped model or DeepSpeed engine.
        model_name: Checkpoint name / tag (without extension).
        info_dict: Training state; must contain ``model_dir`` and
            ``train_engine``. Modified in place on rank 0.
    """
    rank = int(os.environ.get("RANK", 0))
    is_rank_zero = rank == 0
    model_dir = info_dict["model_dir"]
    save_model_path = os.path.join(model_dir, "{}.pt".format(model_name))

    if info_dict["train_engine"] != "torch_ddp":
        with torch.no_grad():
            model.save_checkpoint(
                save_dir=model_dir, tag=model_name, client_state=info_dict
            )
    elif is_rank_zero:
        torch.save(model.module.state_dict(), save_model_path)

    if not is_rank_zero:
        return

    info_path = re.sub(".pt$", ".yaml", save_model_path)
    info_dict["save_time"] = datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S")
    with open(info_path, "w") as fout:
        fout.write(yaml.dump(info_dict))
    logging.info(
        "[Rank {}] Checkpoint: save to checkpoint {}".format(rank, save_model_path)
    )
def cosyvoice_join(group_join, info_dict):
    """Synchronise all ranks and report whether this worker should stop.

    Args:
        group_join: Process group used for the monitored barrier.
        info_dict: Training state; ``batch_idx`` is read.

    Returns:
        ``True`` when the barrier raised ``RuntimeError`` (uneven workload,
        the caller should break out of its loop), ``False`` otherwise. The
        barrier is skipped entirely for the first batch.
    """
    world_size = int(os.environ.get("WORLD_SIZE", 1))
    local_rank = int(os.environ.get("LOCAL_RANK", 0))
    rank = int(os.environ.get("RANK", 0))

    if info_dict["batch_idx"] == 0:
        return False

    # Try to join all ranks in both ddp and deepspeed mode, in case different
    # ranks have different lr.
    try:
        dist.monitored_barrier(
            group=group_join, timeout=group_join.options._timeout
        )
    except RuntimeError as e:
        logging.info(
            "Detected uneven workload distribution: {}\n".format(e)
            + "Break current worker to manually join all workers, "
            + "world_size {}, current rank {}, current local_rank {}\n".format(
                world_size, rank, local_rank
            )
        )
        return True
    return False
def batch_forward(model, batch, info_dict):
    """Run one forward pass and store the result in ``info_dict["loss_dict"]``.

    Args:
        model: Callable invoked as ``model(batch, device)``.
        batch: The batch passed to the model.
        info_dict: Training state; ``dtype`` ("fp16", "bf16", anything else
            means fp32) and ``train_engine`` are read.

    Returns:
        The same ``info_dict`` with ``"loss_dict"`` set.
    """
    # The device is the local rank index taken from the environment.
    device = int(os.environ.get("LOCAL_RANK", 0))

    precision_by_name = {"fp16": torch.float16, "bf16": torch.bfloat16}
    dtype = precision_by_name.get(info_dict["dtype"], torch.float32)

    # Native DDP runs without autocast; every other engine uses CUDA AMP.
    if info_dict["train_engine"] != "torch_ddp":
        autocast = torch.cuda.amp.autocast(
            enabled=True, dtype=dtype, cache_enabled=False
        )
    else:
        autocast = nullcontext()

    with autocast:
        info_dict["loss_dict"] = model(batch, device)
    return info_dict
def batch_backward(model, info_dict):
    """Back-propagate the current loss and store the scaled loss.

    With DeepSpeed the engine's ``backward`` is used and its return value is
    kept. Otherwise the loss is divided by ``accum_grad`` before
    ``backward()`` is called on it.

    Args:
        model: DeepSpeed engine (only used in DeepSpeed mode).
        info_dict: Training state; ``loss_dict["loss"]`` is replaced by the
            scaled loss.

    Returns:
        The same ``info_dict``.
    """
    loss = info_dict["loss_dict"]["loss"]
    if info_dict["train_engine"] != "deepspeed":
        scaled_loss = loss / info_dict["accum_grad"]
        scaled_loss.backward()
    else:
        scaled_loss = model.backward(loss)
    info_dict["loss_dict"]["loss"] = scaled_loss
    return info_dict
def update_parameter_and_lr(model, optimizer, scheduler, info_dict):
    """Apply one optimisation step and record the LR and gradient norm.

    DeepSpeed mode steps the engine on every call and records whether this
    was a gradient-accumulation boundary. Otherwise the optimizer is stepped
    only every ``accum_grad`` batches, after gradient clipping, and is
    skipped when the gradient norm is not finite.

    Args:
        model: DeepSpeed engine or the model whose gradients are clipped.
        optimizer: Optimizer; its first param group's ``lr`` is reported.
        scheduler: LR scheduler (stepped in non-DeepSpeed mode only).
        info_dict: Training state; ``lr`` and ``grad_norm`` are written.

    Returns:
        The same ``info_dict``.
    """
    grad_norm = 0.0
    if info_dict["train_engine"] == "deepspeed":
        info_dict["is_gradient_accumulation_boundary"] = (
            model.is_gradient_accumulation_boundary()
        )
        model.step()
        grad_norm = model.get_global_grad_norm()
    else:
        at_boundary = (info_dict["batch_idx"] + 1) % info_dict["accum_grad"] == 0
        if at_boundary:
            grad_norm = clip_grad_norm_(model.parameters(), info_dict["grad_clip"])
            # A non-finite norm skips the parameter update, but the gradients
            # are still cleared and the scheduler still advances.
            if torch.isfinite(grad_norm):
                optimizer.step()
            optimizer.zero_grad()
            scheduler.step()
    info_dict["lr"] = optimizer.param_groups[0]["lr"]
    info_dict["grad_norm"] = grad_norm
    return info_dict
def log_per_step(writer, info_dict):
    """Log the metrics of one step to TensorBoard and to the debug log.

    Args:
        writer: TensorBoard writer, or ``None`` (non-zero ranks) to skip the
            TensorBoard part.
        info_dict: Training state; reads ``tag``, ``step``, ``batch_idx``,
            ``loss_dict``, ``log_interval`` and, depending on the branch,
            ``epoch``, ``lr``, ``grad_norm``, ``train_engine``,
            ``accum_grad`` and ``is_gradient_accumulation_boundary``.
    """
    tag = info_dict["tag"]
    epoch = info_dict.get("epoch", 0)
    step = info_dict["step"]
    batch_idx = info_dict["batch_idx"]
    loss_dict = info_dict["loss_dict"]
    rank = int(os.environ.get("RANK", 0))

    # Only rank 0 writes to tensorboard, to avoid multi-process writes.
    if writer is not None:
        engine = info_dict["train_engine"]
        if engine == "deepspeed":
            at_boundary = info_dict["is_gradient_accumulation_boundary"] is True
        elif engine == "torch_ddp":
            at_boundary = (batch_idx + 1) % info_dict["accum_grad"] == 0
        else:
            at_boundary = False
        if at_boundary:
            for name in ("epoch", "lr", "grad_norm"):
                writer.add_scalar("{}/{}".format(tag, name), info_dict[name], step + 1)
            for name, value in loss_dict.items():
                writer.add_scalar("{}/{}".format(tag, name), value, step + 1)

    # TRAIN & CV, shell log (stdout)
    if (batch_idx + 1) % info_dict["log_interval"] != 0:
        return
    pieces = ["{} Batch {}/{} ".format(tag, epoch, batch_idx + 1)]
    pieces.extend(
        "{} {:.6f} ".format(name, value) for name, value in loss_dict.items()
    )
    if tag == "TRAIN":
        pieces.append(
            "lr {:.8f} grad_norm {:.6f}".format(
                info_dict["lr"], info_dict["grad_norm"]
            )
        )
    pieces.append(" rank {}".format(rank))
    logging.debug("".join(pieces))
def log_per_save(writer, info_dict):
    """Log cross-validation results when a checkpoint is saved.

    Args:
        writer: TensorBoard writer, or ``None`` to skip the TensorBoard part.
        info_dict: Training state; reads ``tag``, ``epoch``, ``step``,
            ``loss_dict`` and ``lr``.
    """
    tag = info_dict["tag"]
    epoch = info_dict["epoch"]
    step = info_dict["step"]
    loss_dict = info_dict["loss_dict"]
    lr = info_dict["lr"]
    rank = int(os.environ.get("RANK", 0))

    losses = " ".join("{}_{}".format(k, v) for k, v in loss_dict.items())
    # The arguments follow the placeholder order: the losses fill the slot
    # after the lr and the rank fills the slot after the word "rank".
    logging.info(
        "Epoch {} Step {} CV info lr {} {} rank {}".format(
            epoch, step + 1, lr, losses, rank
        )
    )

    if writer is not None:
        for k in ["epoch", "lr"]:
            writer.add_scalar("{}/{}".format(tag, k), info_dict[k], step + 1)
        for k, v in loss_dict.items():
            writer.add_scalar("{}/{}".format(tag, k), v, step + 1)

funasr_detach/__init__.py DELETED Viewed

@@ -1,38 +0,0 @@
-"""Initialize funasr package."""
-import os
-import pkgutil
-import importlib
# Read the package version from the version.txt file that sits next to this
# module. os, pkgutil and importlib are already imported at the top of the
# file, so they are not imported a second time here.
dirname = os.path.dirname(__file__)
version_file = os.path.join(dirname, "version.txt")
with open(version_file, "r", encoding="utf-8") as f:
    __version__ = f.read().strip()
def import_submodules(package, recursive=True):
    """Import the submodules of ``package`` on a best-effort basis.

    Args:
        package: A package object or its dotted name.
        recursive: When true, submodules of nested subpackages are imported
            as well; when false, only the direct children are imported.

    Returns:
        Dict mapping the fully qualified name of each successfully imported
        submodule to its module object. Submodules that fail to import are
        left out (the failure is logged at debug level).
    """
    import logging

    if isinstance(package, str):
        package = importlib.import_module(package)
    prefix = package.__name__ + "."

    if recursive:
        # walk_packages already descends into subpackages, so no manual
        # recursion is needed. The onerror hook keeps a subpackage that
        # cannot be imported from aborting the whole walk.
        module_infos = pkgutil.walk_packages(
            package.__path__, prefix, onerror=lambda _name: None
        )
    else:
        module_infos = pkgutil.iter_modules(package.__path__, prefix)

    results = {}
    for module_info in module_infos:
        name = module_info.name
        try:
            results[name] = importlib.import_module(name)
        except Exception as exc:
            # Deliberately best-effort: a broken optional submodule must not
            # prevent the package from being imported.
            logging.getLogger(__name__).debug("Failed to import %s: %s", name, exc)
    return results
-import_submodules(__name__)
-from funasr_detach.auto.auto_model import AutoModel
-from funasr_detach.auto.auto_frontend import AutoFrontend

funasr_detach/auto/__init__.py DELETED Viewed

File without changes

funasr_detach/auto/auto_frontend.py DELETED Viewed

@@ -1,90 +0,0 @@
-import time
-import logging
-from tqdm import tqdm
-from funasr_detach.register import tables
-from funasr_detach.download.download_from_hub import download_model
-from funasr_detach.utils.load_utils import load_audio_text_image_video, extract_fbank
-from funasr_detach.auto.auto_model import prepare_data_iterator
-from funasr_detach.auto.auto_model import prepare_data_iterator
class AutoFrontend:
    """Loads a registered frontend and turns raw inputs into feature batches."""

    def __init__(self, **kwargs):
        """Build the frontend named in ``kwargs``.

        ``kwargs`` must contain ``"model"``. When ``"model_conf"`` is absent
        the configuration is fetched with ``download_model``. If a
        ``"frontend"`` name is configured, the matching class is looked up in
        ``tables.frontend_classes`` and built from ``kwargs["frontend_conf"]``.
        The remaining kwargs (without ``"frontend"``) are kept as call
        defaults.
        """
        assert "model" in kwargs
        if "model_conf" not in kwargs:
            logging.info(
                "download models from model hub: {}".format(
                    kwargs.get("model_hub", "ms")
                )
            )
            kwargs = download_model(**kwargs)

        # build frontend
        frontend = kwargs.get("frontend", None)
        if frontend is not None:
            frontend_class = tables.frontend_classes.get(frontend)
            frontend = frontend_class(**kwargs["frontend_conf"])

        self.frontend = frontend
        if "frontend" in kwargs:
            del kwargs["frontend"]
        self.kwargs = kwargs

    def __call__(self, input, input_len=None, kwargs=None, **cfg):
        """Extract features for ``input`` in batches.

        Args:
            input: Anything accepted by ``prepare_data_iterator``.
            input_len: Forwarded to ``prepare_data_iterator``.
            kwargs: Optional settings dict used instead of the stored one.
            **cfg: Overrides merged into the settings dict (in place).

        Returns:
            List of dicts with keys ``"input"``, ``"input_len"`` and ``"key"``,
            one per batch, with the tensors placed on the configured device.
        """
        kwargs = self.kwargs if kwargs is None else kwargs
        # NOTE: this updates the stored settings dict in place, so overrides
        # persist across calls.
        kwargs.update(cfg)

        key_list, data_list = prepare_data_iterator(input, input_len=input_len)
        batch_size = kwargs.get("batch_size", 1)
        device = kwargs.get("device", "cpu")
        if device == "cpu":
            batch_size = 1

        meta_data = {}
        result_list = []
        num_samples = len(data_list)
        pbar = tqdm(colour="blue", total=num_samples + 1, dynamic_ncols=True)

        time0 = time.perf_counter()
        for beg_idx in range(0, num_samples, batch_size):
            end_idx = min(num_samples, beg_idx + batch_size)
            data_batch = data_list[beg_idx:end_idx]
            key_batch = key_list[beg_idx:end_idx]

            # extract fbank feats
            time1 = time.perf_counter()
            audio_sample_list = load_audio_text_image_video(
                data_batch, fs=self.frontend.fs, audio_fs=kwargs.get("fs", 16000)
            )
            time2 = time.perf_counter()
            meta_data["load_data"] = f"{time2 - time1:0.3f}"
            speech, speech_lengths = extract_fbank(
                audio_sample_list,
                data_type=kwargs.get("data_type", "sound"),
                frontend=self.frontend,
                **kwargs,
            )
            time3 = time.perf_counter()
            meta_data["extract_feat"] = f"{time3 - time2:0.3f}"
            meta_data["batch_data_time"] = (
                speech_lengths.sum().item()
                * self.frontend.frame_shift
                * self.frontend.lfr_n
                / 1000
            )

            # .to() returns the moved tensor instead of moving it in place,
            # so the results must be kept for the batch to land on `device`.
            speech = speech.to(device=device)
            speech_lengths = speech_lengths.to(device=device)
            batch = {"input": speech, "input_len": speech_lengths, "key": key_batch}
            result_list.append(batch)

            pbar.update(1)
            description = f"{meta_data}, "
            pbar.set_description(description)

        time_end = time.perf_counter()
        pbar.set_description(f"time escaped total: {time_end - time0:0.3f}")
        return result_list

funasr_detach/auto/auto_model.py DELETED Viewed

@@ -1,573 +0,0 @@
-import json
-import time
-import copy
-import torch
-import random
-import string
-import logging
-import os.path
-import numpy as np
-from tqdm import tqdm
-from funasr_detach.register import tables
-from funasr_detach.utils.load_utils import load_bytes
-from funasr_detach.download.file import download_from_url
-from funasr_detach.download.download_from_hub import download_model
-from funasr_detach.utils.vad_utils import slice_padding_audio_samples
-from funasr_detach.train_utils.set_all_random_seed import set_all_random_seed
-from funasr_detach.train_utils.load_pretrained_model import load_pretrained_model
-from funasr_detach.utils.load_utils import load_audio_text_image_video
-from funasr_detach.utils.timestamp_tools import timestamp_sentence
-from funasr_detach.models.campplus.utils import sv_chunk, postprocess, distribute_spk
try:
    from funasr_detach.models.campplus.cluster_backend import ClusterBackend
except Exception:
    # Best-effort optional dependency: any import-time failure only disables
    # speaker diarization. `except Exception` (rather than a bare `except`)
    # lets KeyboardInterrupt and SystemExit propagate.
    print("If you want to use the speaker diarization, please `pip install hdbscan`")
def prepare_data_iterator(data_in, input_len=None, data_type=None, key=None):
    """Normalise ``data_in`` into parallel lists of keys and data items.

    Supported inputs:
      * a URL (string starting with ``"http"``), downloaded first;
      * a path to an existing list file (``.scp``, ``.txt``, ``.json``,
        ``.jsonl``), read line by line;
      * a path to any other existing file, used as a single item;
      * a list/tuple of items, or of several parallel inputs when
        ``data_type`` is itself a list/tuple;
      * anything else (raw text, samples, features, bytes) as a single item.

    :param data_in: the input described above
    :param input_len: accepted for interface compatibility; not used here
    :param data_type: a list/tuple selects the multiple-input mode
    :param key: key for a single raw item; a random key is generated if None
    :return: ``(key_list, data_list)``
    """
    data_list = []
    key_list = []
    filelist = [".scp", ".txt", ".json", ".jsonl"]
    chars = string.ascii_letters + string.digits

    def _random_key():
        # "rand_key_" followed by 13 random alphanumeric characters.
        return "rand_key_" + "".join(random.choice(chars) for _ in range(13))

    if isinstance(data_in, str) and data_in.startswith("http"):  # url
        data_in = download_from_url(data_in)

    # wav_path; filelist: wav.scp, file.jsonl; text.txt
    if isinstance(data_in, str) and os.path.exists(data_in):
        suffix = os.path.splitext(data_in)[1].lower()
        if suffix not in filelist:
            return [_random_key()], [data_in]

        is_jsonl = data_in.endswith(".jsonl")
        with open(data_in, encoding="utf-8") as fin:
            for line in fin:
                key = _random_key()
                if is_jsonl:  # file.jsonl: json.dumps({"source": data})
                    record = json.loads(line.strip())
                    data = record["source"]
                    if "key" in data:
                        key = data["key"]
                else:  # filelist, wav.scp, text.txt: id \t data or data
                    fields = line.strip().split(maxsplit=1)
                    if len(fields) > 1:
                        data = fields[1]
                        key = fields[0]
                    else:
                        data = fields[0]
                data_list.append(data)
                key_list.append(key)
        return key_list, data_list

    if isinstance(data_in, (list, tuple)):
        if data_type is not None and isinstance(data_type, (list, tuple)):
            # Multiple inputs: resolve each one, then regroup item-wise. The
            # keys of the last input are the ones that are returned.
            per_input = []
            for data_in_i, data_type_i in zip(data_in, data_type):
                key_list, data_list_i = prepare_data_iterator(
                    data_in=data_in_i, data_type=data_type_i
                )
                per_input.append(data_list_i)
            data_list = list(zip(*per_input))
        else:
            # [audio sample point, fbank, text]
            data_list = data_in
            key_list = [_random_key() for _ in range(len(data_in))]
        return key_list, data_list

    # raw text; audio sample point, fbank; bytes
    if isinstance(data_in, bytes):  # audio bytes
        data_in = load_bytes(data_in)
    if key is None:
        key = _random_key()
    return [key], [data_in]
-class AutoModel:
-    def __init__(self, **kwargs):
-        if not kwargs.get("disable_log", False):
-            tables.print()
-        model, kwargs = self.build_model(**kwargs)
-        # if vad_model is not None, build vad model else None
-        vad_model = kwargs.get("vad_model", None)
-        vad_kwargs = kwargs.get("vad_model_revision", None)
-        if vad_model is not None:
-            logging.info("Building VAD model.")
-            vad_kwargs = {
-                "model": vad_model,
-                "model_revision": vad_kwargs,
-                "device": kwargs["device"],
-            }
-            vad_model, vad_kwargs = self.build_model(**vad_kwargs)
-        # if punc_model is not None, build punc model else None
-        punc_model = kwargs.get("punc_model", None)
-        punc_kwargs = kwargs.get("punc_model_revision", None)
-        if punc_model is not None:
-            logging.info("Building punc model.")
-            punc_kwargs = {
-                "model": punc_model,
-                "model_revision": punc_kwargs,
-                "device": kwargs["device"],
-            }
-            punc_model, punc_kwargs = self.build_model(**punc_kwargs)
-        # if spk_model is not None, build spk model else None
-        spk_model = kwargs.get("spk_model", None)
-        spk_kwargs = kwargs.get("spk_model_revision", None)
-        if spk_model is not None:
-            logging.info("Building SPK model.")
-            spk_kwargs = {
-                "model": spk_model,
-                "model_revision": spk_kwargs,
-                "device": kwargs["device"],
-            }
-            spk_model, spk_kwargs = self.build_model(**spk_kwargs)
-            self.cb_model = ClusterBackend().to(kwargs["device"])
-            spk_mode = kwargs.get("spk_mode", "punc_segment")
-            if spk_mode not in ["default", "vad_segment", "punc_segment"]:
-                logging.error(
-                    "spk_mode should be one of default, vad_segment and punc_segment."
-                )
-            self.spk_mode = spk_mode
-        self.kwargs = kwargs
-        self.model = model
-        self.vad_model = vad_model
-        self.vad_kwargs = vad_kwargs
-        self.punc_model = punc_model
-        self.punc_kwargs = punc_kwargs
-        self.spk_model = spk_model
-        self.spk_kwargs = spk_kwargs
-        self.model_path = kwargs.get("model_path")
-    def build_model(self, **kwargs):
-        assert "model" in kwargs
-        if "model_conf" not in kwargs:
-            logging.info(
-                "download models from model hub: {}".format(
-                    kwargs.get("model_hub", "ms")
-                )
-            )
-            kwargs = download_model(**kwargs)
-        set_all_random_seed(kwargs.get("seed", 0))
-        device = kwargs.get("device", "cuda")
-        if not torch.cuda.is_available() or kwargs.get("ngpu", 1) == 0:
-            device = "cpu"
-            kwargs["batch_size"] = 1
-        kwargs["device"] = device
-        if kwargs.get("ncpu", None):
-            torch.set_num_threads(kwargs.get("ncpu"))
-        # build tokenizer
-        tokenizer = kwargs.get("tokenizer", None)
-        if tokenizer is not None:
-            tokenizer_class = tables.tokenizer_classes.get(tokenizer)
-            tokenizer = tokenizer_class(**kwargs["tokenizer_conf"])
-            kwargs["tokenizer"] = tokenizer
-            kwargs["token_list"] = tokenizer.token_list
-            vocab_size = len(tokenizer.token_list)
-        else:
-            vocab_size = -1
-        # build frontend
-        frontend = kwargs.get("frontend", None)
-        if frontend is not None:
-            frontend_class = tables.frontend_classes.get(frontend)
-            frontend = frontend_class(**kwargs["frontend_conf"])
-            kwargs["frontend"] = frontend
-            kwargs["input_size"] = frontend.output_size()
-        # build model
-        model_class = tables.model_classes.get(kwargs["model"])
-        model = model_class(**kwargs, **kwargs["model_conf"], vocab_size=vocab_size)
-        model.to(device)
-        # init_param
-        init_param = kwargs.get("init_param", None)
-        if init_param is not None:
-            logging.info(f"Loading pretrained params from {init_param}")
-            load_pretrained_model(
-                model=model,
-                path=init_param,
-                ignore_init_mismatch=kwargs.get("ignore_init_mismatch", False),
-                oss_bucket=kwargs.get("oss_bucket", None),
-                scope_map=kwargs.get("scope_map", None),
-                excludes=kwargs.get("excludes", None),
-            )
-        return model, kwargs
-    def __call__(self, *args, **cfg):
-        kwargs = self.kwargs
-        kwargs.update(cfg)
-        res = self.model(*args, kwargs)
-        return res
-    def generate(self, input, input_len=None, **cfg):
-        if self.vad_model is None:
-            return self.inference(input, input_len=input_len, **cfg)
-        else:
-            return self.inference_with_vad(input, input_len=input_len, **cfg)
-    def inference(
-        self, input, input_len=None, model=None, kwargs=None, key=None, **cfg
-    ):
-        kwargs = self.kwargs if kwargs is None else kwargs
-        kwargs.update(cfg)
-        model = self.model if model is None else model
-        model = model.cuda()
-        model.eval()
-        batch_size = kwargs.get("batch_size", 1)
-        # if kwargs.get("device", "cpu") == "cpu":
-        #     batch_size = 1
-        key_list, data_list = prepare_data_iterator(
-            input, input_len=input_len, data_type=kwargs.get("data_type", None), key=key
-        )
-        speed_stats = {}
-        asr_result_list = []
-        num_samples = len(data_list)
-        disable_pbar = kwargs.get("disable_pbar", False)
-        pbar = (
-            tqdm(colour="blue", total=num_samples, dynamic_ncols=True)
-            if not disable_pbar
-            else None
-        )
-        time_speech_total = 0.0
-        time_escape_total = 0.0
-        for beg_idx in range(0, num_samples, batch_size):
-            end_idx = min(num_samples, beg_idx + batch_size)
-            data_batch = data_list[beg_idx:end_idx]
-            key_batch = key_list[beg_idx:end_idx]
-            batch = {"data_in": data_batch, "key": key_batch}
-            if (end_idx - beg_idx) == 1 and kwargs.get(
-                "data_type", None
-            ) == "fbank":  # fbank
-                batch["data_in"] = data_batch[0]
-                batch["data_lengths"] = input_len
-            time1 = time.perf_counter()
-            with torch.no_grad():
-                results, meta_data = model.inference(**batch, **kwargs)
-            time2 = time.perf_counter()
-            asr_result_list.extend(results)
-            # batch_data_time = time_per_frame_s * data_batch_i["speech_lengths"].sum().item()
-            batch_data_time = meta_data.get("batch_data_time", -1)
-            time_escape = time2 - time1
-            speed_stats["load_data"] = meta_data.get("load_data", 0.0)
-            speed_stats["extract_feat"] = meta_data.get("extract_feat", 0.0)
-            speed_stats["forward"] = f"{time_escape:0.3f}"
-            speed_stats["batch_size"] = f"{len(results)}"
-            speed_stats["time_cost"] = f"{(time_escape)}"
-            speed_stats["rtf"] = f"{(time_escape) / batch_data_time:0.3f}"
-            description = f"{speed_stats}, "
-            if pbar:
-                pbar.update(1)
-                pbar.set_description(description)
-            time_speech_total += batch_data_time
-            time_escape_total += time_escape
-        if pbar:
-            # pbar.update(1)
-            pbar.set_description(f"rtf_avg: {time_escape_total/time_speech_total:0.3f}")
-        torch.cuda.empty_cache()
-        return asr_result_list
-    def inference_with_vad(self, input, input_len=None, **cfg):
-        # step.1: compute the vad model
-        self.vad_kwargs.update(cfg)
-        beg_vad = time.time()
-        res = self.inference(
-            input,
-            input_len=input_len,
-            model=self.vad_model,
-            kwargs=self.vad_kwargs,
-            **cfg,
-        )
-        end_vad = time.time()
-        print(f"time cost vad: {end_vad - beg_vad:0.3f}")
-        # step.2 compute asr model
-        model = self.model
-        kwargs = self.kwargs
-        kwargs.update(cfg)
-        batch_size = int(kwargs.get("batch_size_s", 300)) * 1000
-        batch_size_threshold_ms = int(kwargs.get("batch_size_threshold_s", 60)) * 1000
-        kwargs["batch_size"] = batch_size
-        key_list, data_list = prepare_data_iterator(
-            input, input_len=input_len, data_type=kwargs.get("data_type", None)
-        )
-        results_ret_list = []
-        time_speech_total_all_samples = 1e-6
-        beg_total = time.time()
-        pbar_total = tqdm(colour="red", total=len(res), dynamic_ncols=True)
-        for i in range(len(res)):
-            key = res[i]["key"]
-            vadsegments = res[i]["value"]
-            input_i = data_list[i]
-            speech = load_audio_text_image_video(
-                input_i, fs=kwargs["frontend"].fs, audio_fs=kwargs.get("fs", 16000)
-            )
-            speech_lengths = len(speech)
-            n = len(vadsegments)
-            data_with_index = [(vadsegments[i], i) for i in range(n)]
-            sorted_data = sorted(data_with_index, key=lambda x: x[0][1] - x[0][0])
-            results_sorted = []
-            if not len(sorted_data):
-                logging.info("decoding, utt: {}, empty speech".format(key))
-                continue
-            if len(sorted_data) > 0 and len(sorted_data[0]) > 0:
-                batch_size = max(
-                    batch_size, sorted_data[0][0][1] - sorted_data[0][0][0]
-                )
-            batch_size_ms_cum = 0
-            beg_idx = 0
-            beg_asr_total = time.time()
-            time_speech_total_per_sample = speech_lengths / 16000
-            time_speech_total_all_samples += time_speech_total_per_sample
-            all_segments = []
-            for j, _ in enumerate(range(0, n)):
-                # pbar_sample.update(1)
-                batch_size_ms_cum += sorted_data[j][0][1] - sorted_data[j][0][0]
-                if (
-                    j < n - 1
-                    and (
-                        batch_size_ms_cum
-                        + sorted_data[j + 1][0][1]
-                        - sorted_data[j + 1][0][0]
-                    )
-                    < batch_size
-                    and (sorted_data[j + 1][0][1] - sorted_data[j + 1][0][0])
-                    < batch_size_threshold_ms
-                ):
-                    continue
-                batch_size_ms_cum = 0
-                end_idx = j + 1
-                speech_j, speech_lengths_j = slice_padding_audio_samples(
-                    speech, speech_lengths, sorted_data[beg_idx:end_idx]
-                )
-                results = self.inference(
-                    speech_j,
-                    input_len=None,
-                    model=model,
-                    kwargs=kwargs,
-                    disable_pbar=True,
-                    **cfg,
-                )
-                if self.spk_model is not None:
-                    # compose vad segments: [[start_time_sec, end_time_sec, speech], [...]]
-                    for _b in range(len(speech_j)):
-                        vad_segments = [
-                            [
-                                sorted_data[beg_idx:end_idx][_b][0][0] / 1000.0,
-                                sorted_data[beg_idx:end_idx][_b][0][1] / 1000.0,
-                                np.array(speech_j[_b]),
-                            ]
-                        ]
-                        segments = sv_chunk(vad_segments)
-                        all_segments.extend(segments)
-                        speech_b = [i[2] for i in segments]
-                        spk_res = self.inference(
-                            speech_b,
-                            input_len=None,
-                            model=self.spk_model,
-                            kwargs=kwargs,
-                            disable_pbar=True,
-                            **cfg,
-                        )
-                        results[_b]["spk_embedding"] = spk_res[0]["spk_embedding"]
-                beg_idx = end_idx
-                if len(results) < 1:
-                    continue
-                results_sorted.extend(results)
-            restored_data = [0] * n
-            for j in range(n):
-                index = sorted_data[j][1]
-                restored_data[index] = results_sorted[j]
-            result = {}
-            # results combine for texts, timestamps, speaker embeddings and others
-            # TODO: rewrite for clean code
-            for j in range(n):
-                for k, v in restored_data[j].items():
-                    if k.startswith("timestamp"):
-                        if k not in result:
-                            result[k] = []
-                        for t in restored_data[j][k]:
-                            t[0] += vadsegments[j][0]
-                            t[1] += vadsegments[j][0]
-                        result[k].extend(restored_data[j][k])
-                    elif k == "spk_embedding":
-                        if k not in result:
-                            result[k] = restored_data[j][k]
-                        else:
-                            result[k] = torch.cat(
-                                [result[k], restored_data[j][k]], dim=0
-                            )
-                    elif "text" in k:
-                        if k not in result:
-                            result[k] = restored_data[j][k]
-                        else:
-                            result[k] += " " + restored_data[j][k]
-                    else:
-                        if k not in result:
-                            result[k] = restored_data[j][k]
-                        else:
-                            result[k] += restored_data[j][k]
-            return_raw_text = kwargs.get("return_raw_text", False)
-            # step.3 compute punc model
-            if self.punc_model is not None:
-                self.punc_kwargs.update(cfg)
-                punc_res = self.inference(
-                    result["text"],
-                    model=self.punc_model,
-                    kwargs=self.punc_kwargs,
-                    disable_pbar=True,
-                    **cfg,
-                )
-                raw_text = copy.copy(result["text"])
-                if return_raw_text:
-                    result["raw_text"] = raw_text
-                result["text"] = punc_res[0]["text"]
-            else:
-                raw_text = None
-            # speaker embedding cluster after resorted
-            if self.spk_model is not None and kwargs.get("return_spk_res", True):
-                if raw_text is None:
-                    logging.error("Missing punc_model, which is required by spk_model.")
-                all_segments = sorted(all_segments, key=lambda x: x[0])
-                spk_embedding = result["spk_embedding"]
-                labels = self.cb_model(
-                    spk_embedding.cpu(), oracle_num=kwargs.get("preset_spk_num", None)
-                )
-                # del result['spk_embedding']
-                sv_output = postprocess(all_segments, None, labels, spk_embedding.cpu())
-                if self.spk_mode == "vad_segment":  # recover sentence_list
-                    sentence_list = []
-                    for res, vadsegment in zip(restored_data, vadsegments):
-                        if "timestamp" not in res:
-                            logging.error(
-                                "Only 'iic/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch' \
-                                           and 'iic/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch'\
-                                           can predict timestamp, and speaker diarization relies on timestamps."
-                            )
-                        sentence_list.append(
-                            {
-                                "start": vadsegment[0],
-                                "end": vadsegment[1],
-                                "sentence": res["text"],
-                                "timestamp": res["timestamp"],
-                            }
-                        )
-                elif self.spk_mode == "punc_segment":
-                    if "timestamp" not in result:
-                        logging.error(
-                            "Only 'iic/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch' \
-                                       and 'iic/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch'\
-                                       can predict timestamp, and speaker diarization relies on timestamps."
-                        )
-                    sentence_list = timestamp_sentence(
-                        punc_res[0]["punc_array"],
-                        result["timestamp"],
-                        raw_text,
-                        return_raw_text=return_raw_text,
-                    )
-                distribute_spk(sentence_list, sv_output)
-                result["sentence_info"] = sentence_list
-            elif kwargs.get("sentence_timestamp", False):
-                sentence_list = timestamp_sentence(
-                    punc_res[0]["punc_array"],
-                    result["timestamp"],
-                    raw_text,
-                    return_raw_text=return_raw_text,
-                )
-                result["sentence_info"] = sentence_list
-            if "spk_embedding" in result:
-                del result["spk_embedding"]
-            result["key"] = key
-            results_ret_list.append(result)
-            end_asr_total = time.time()
-            time_escape_total_per_sample = end_asr_total - beg_asr_total
-            pbar_total.update(1)
-            pbar_total.set_description(
-                f"rtf_avg: {time_escape_total_per_sample / time_speech_total_per_sample:0.3f}, "
-                f"time_speech: {time_speech_total_per_sample: 0.3f}, "
-                f"time_escape: {time_escape_total_per_sample:0.3f}"
-            )
-        return results_ret_list
-    def infer_encoder(
-        self, input, input_len=None, model=None, kwargs=None, key=None, **cfg
-    ):
-        kwargs = self.kwargs if kwargs is None else kwargs
-        kwargs.update(cfg)
-        model = self.model if model is None else model
-        model = model.cuda()
-        model.eval()
-        batch_size = kwargs.get("batch_size", 1)
-        key_list, data_list = prepare_data_iterator(
-            input, input_len=input_len, data_type=kwargs.get("data_type", None), key=key
-        )
-        asr_result_list = []
-        num_samples = len(data_list)
-        for beg_idx in range(0, num_samples, batch_size):
-            end_idx = min(num_samples, beg_idx + batch_size)
-            data_batch = data_list[beg_idx:end_idx]
-            key_batch = key_list[beg_idx:end_idx]
-            batch = {"data_in": data_batch, "key": key_batch}
-            if (end_idx - beg_idx) == 1 and kwargs.get(
-                "data_type", None
-            ) == "fbank":  # fbank
-                batch["data_in"] = data_batch[0]
-                batch["data_lengths"] = input_len
-            with torch.no_grad():
-                results, meta_data, cache = model.infer_encoder(**batch, **kwargs)
-            asr_result_list.extend(results)
-        torch.cuda.empty_cache()
-        return asr_result_list, cache

funasr_detach/auto/auto_tokenizer.py DELETED Viewed

@@ -1,7 +0,0 @@
-class AutoTokenizer:
-    """
-    Undo
-    """
-    def __init__(self):
-        pass

funasr_detach/bin/__init__.py DELETED Viewed

File without changes

funasr_detach/bin/compute_audio_cmvn.py DELETED Viewed

@@ -1,152 +0,0 @@
-import os
-import json
-import numpy as np
-import torch
-import hydra
-import logging
-from omegaconf import DictConfig, OmegaConf
-from funasr_detach.register import tables
-from funasr_detach.download.download_from_hub import download_model
-from funasr_detach.train_utils.set_all_random_seed import set_all_random_seed
-@hydra.main(config_name=None, version_base=None)
-def main_hydra(kwargs: DictConfig):
-    if kwargs.get("debug", False):
-        import pdb
-        pdb.set_trace()
-    assert "model" in kwargs
-    if "model_conf" not in kwargs:
-        logging.info(
-            "download models from model hub: {}".format(kwargs.get("model_hub", "ms"))
-        )
-        kwargs = download_model(is_training=kwargs.get("is_training", True), **kwargs)
-    main(**kwargs)
-def main(**kwargs):
-    print(kwargs)
-    # set random seed
-    tables.print()
-    set_all_random_seed(kwargs.get("seed", 0))
-    torch.backends.cudnn.enabled = kwargs.get(
-        "cudnn_enabled", torch.backends.cudnn.enabled
-    )
-    torch.backends.cudnn.benchmark = kwargs.get(
-        "cudnn_benchmark", torch.backends.cudnn.benchmark
-    )
-    torch.backends.cudnn.deterministic = kwargs.get("cudnn_deterministic", True)
-    tokenizer = kwargs.get("tokenizer", None)
-    # build frontend if frontend is none None
-    frontend = kwargs.get("frontend", None)
-    if frontend is not None:
-        frontend_class = tables.frontend_classes.get(frontend)
-        frontend = frontend_class(**kwargs["frontend_conf"])
-        kwargs["frontend"] = frontend
-        kwargs["input_size"] = frontend.output_size()
-    # dataset
-    dataset_class = tables.dataset_classes.get(kwargs.get("dataset", "AudioDataset"))
-    dataset_train = dataset_class(
-        kwargs.get("train_data_set_list"),
-        frontend=frontend,
-        tokenizer=None,
-        is_training=False,
-        **kwargs.get("dataset_conf")
-    )
-    # dataloader
-    batch_sampler = kwargs["dataset_conf"].get(
-        "batch_sampler", "DynamicBatchLocalShuffleSampler"
-    )
-    batch_sampler_train = None
-    if batch_sampler is not None:
-        batch_sampler_class = tables.batch_sampler_classes.get(batch_sampler)
-        dataset_conf = kwargs.get("dataset_conf")
-        dataset_conf["batch_type"] = "example"
-        dataset_conf["batch_size"] = 1
-        batch_sampler_train = batch_sampler_class(
-            dataset_train, is_training=False, **dataset_conf
-        )
-    dataloader_train = torch.utils.data.DataLoader(
-        dataset_train,
-        collate_fn=dataset_train.collator,
-        batch_sampler=batch_sampler_train,
-        num_workers=int(kwargs.get("dataset_conf").get("num_workers", 4)),
-        pin_memory=True,
-    )
-    iter_stop = int(kwargs.get("scale", 1.0) * len(dataloader_train))
-    total_frames = 0
-    for batch_idx, batch in enumerate(dataloader_train):
-        if batch_idx >= iter_stop:
-            break
-        fbank = batch["speech"].numpy()[0, :, :]
-        if total_frames == 0:
-            mean_stats = np.sum(fbank, axis=0)
-            var_stats = np.sum(np.square(fbank), axis=0)
-        else:
-            mean_stats += np.sum(fbank, axis=0)
-            var_stats += np.sum(np.square(fbank), axis=0)
-        total_frames += fbank.shape[0]
-    cmvn_info = {
-        "mean_stats": list(mean_stats.tolist()),
-        "var_stats": list(var_stats.tolist()),
-        "total_frames": total_frames,
-    }
-    cmvn_file = kwargs.get("cmvn_file", "cmvn.json")
-    # import pdb;pdb.set_trace()
-    with open(cmvn_file, "w") as fout:
-        fout.write(json.dumps(cmvn_info))
-    mean = -1.0 * mean_stats / total_frames
-    var = 1.0 / np.sqrt(var_stats / total_frames - mean * mean)
-    dims = mean.shape[0]
-    am_mvn = os.path.dirname(cmvn_file) + "/am.mvn"
-    with open(am_mvn, "w") as fout:
-        fout.write(
-            "<Nnet>"
-            + "\n"
-            + "<Splice> "
-            + str(dims)
-            + " "
-            + str(dims)
-            + "\n"
-            + "[ 0 ]"
-            + "\n"
-            + "<AddShift> "
-            + str(dims)
-            + " "
-            + str(dims)
-            + "\n"
-        )
-        mean_str = (
-            str(list(mean)).replace(",", "").replace("[", "[ ").replace("]", " ]")
-        )
-        fout.write("<LearnRateCoef> 0 " + mean_str + "\n")
-        fout.write("<Rescale> " + str(dims) + " " + str(dims) + "\n")
-        var_str = str(list(var)).replace(",", "").replace("[", "[ ").replace("]", " ]")
-        fout.write("<LearnRateCoef> 0 " + var_str + "\n")
-        fout.write("</Nnet>" + "\n")
-"""
-python funasr/bin/compute_audio_cmvn.py \
---config-path "/Users/zhifu/funasr1.0/examples/aishell/paraformer/conf" \
---config-name "train_asr_paraformer_conformer_12e_6d_2048_256.yaml" \
-++train_data_set_list="/Users/zhifu/funasr1.0/data/list/audio_datasets.jsonl" \
-++cmvn_file="/Users/zhifu/funasr1.0/data/list/cmvn.json" \
-++dataset_conf.num_workers=0
-"""
-if __name__ == "__main__":
-    main_hydra()

funasr_detach/bin/inference.py DELETED Viewed

@@ -1,33 +0,0 @@
-import hydra
-import logging
-from omegaconf import DictConfig, OmegaConf, ListConfig
-from funasr_detach.auto.auto_model import AutoModel
-@hydra.main(config_name=None, version_base=None)
-def main_hydra(cfg: DictConfig):
-    def to_plain_list(cfg_item):
-        if isinstance(cfg_item, ListConfig):
-            return OmegaConf.to_container(cfg_item, resolve=True)
-        elif isinstance(cfg_item, DictConfig):
-            return {k: to_plain_list(v) for k, v in cfg_item.items()}
-        else:
-            return cfg_item
-    kwargs = to_plain_list(cfg)
-    log_level = getattr(logging, kwargs.get("log_level", "INFO").upper())
-    logging.basicConfig(level=log_level)
-    if kwargs.get("debug", False):
-        import pdb
-        pdb.set_trace()
-    model = AutoModel(**kwargs)
-    res = model.generate(input=kwargs["input"])
-    print(res)
-if __name__ == "__main__":
-    main_hydra()

funasr_detach/bin/tokenize_text.py DELETED Viewed

@@ -1,281 +0,0 @@
-#!/usr/bin/env python3
-import argparse
-from collections import Counter
-import logging
-from pathlib import Path
-import sys
-from typing import List
-from typing import Optional
-from funasr_detach.utils.cli_utils import get_commandline_args
-from funasr_detach.tokenizer.build_tokenizer import build_tokenizer
-from funasr_detach.tokenizer.cleaner import TextCleaner
-from funasr_detach.tokenizer.phoneme_tokenizer import g2p_classes
-from funasr_detach.utils.types import str2bool
-from funasr_detach.utils.types import str_or_none
-def field2slice(field: Optional[str]) -> slice:
-    """Convert field string to slice
-    Note that field string accepts 1-based integer.
-    Examples:
-        >>> field2slice("1-")
-        slice(0, None, None)
-        >>> field2slice("1-3")
-        slice(0, 3, None)
-        >>> field2slice("-3")
-        slice(None, 3, None)
-    """
-    field = field.strip()
-    try:
-        if "-" in field:
-            # e.g. "2-" or "2-5" or "-7"
-            s1, s2 = field.split("-", maxsplit=1)
-            if s1.strip() == "":
-                s1 = None
-            else:
-                s1 = int(s1)
-                if s1 == 0:
-                    raise ValueError("1-based string")
-            if s2.strip() == "":
-                s2 = None
-            else:
-                s2 = int(s2)
-        else:
-            # e.g. "2"
-            s1 = int(field)
-            s2 = s1 + 1
-            if s1 == 0:
-                raise ValueError("must be 1 or more value")
-    except ValueError:
-        raise RuntimeError(f"Format error: e.g. '2-', '2-5', or '-5': {field}")
-    if s1 is None:
-        slic = slice(None, s2)
-    else:
-        # -1 because of 1-based integer following "cut" command
-        # e.g "1-3" -> slice(0, 3)
-        slic = slice(s1 - 1, s2)
-    return slic
-def tokenize(
-    input: str,
-    output: str,
-    field: Optional[str],
-    delimiter: Optional[str],
-    token_type: str,
-    space_symbol: str,
-    non_linguistic_symbols: Optional[str],
-    bpemodel: Optional[str],
-    log_level: str,
-    write_vocabulary: bool,
-    vocabulary_size: int,
-    remove_non_linguistic_symbols: bool,
-    cutoff: int,
-    add_symbol: List[str],
-    cleaner: Optional[str],
-    g2p: Optional[str],
-):
-    logging.basicConfig(
-        level=log_level,
-        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
-    )
-    if input == "-":
-        fin = sys.stdin
-    else:
-        fin = Path(input).open("r", encoding="utf-8")
-    if output == "-":
-        fout = sys.stdout
-    else:
-        p = Path(output)
-        p.parent.mkdir(parents=True, exist_ok=True)
-        fout = p.open("w", encoding="utf-8")
-    cleaner = TextCleaner(cleaner)
-    tokenizer = build_tokenizer(
-        token_type=token_type,
-        bpemodel=bpemodel,
-        delimiter=delimiter,
-        space_symbol=space_symbol,
-        non_linguistic_symbols=non_linguistic_symbols,
-        remove_non_linguistic_symbols=remove_non_linguistic_symbols,
-        g2p_type=g2p,
-    )
-    counter = Counter()
-    if field is not None:
-        field = field2slice(field)
-    for line in fin:
-        line = line.rstrip()
-        if field is not None:
-            # e.g. field="2-"
-            # uttidA hello world!! -> hello world!!
-            tokens = line.split(delimiter)
-            tokens = tokens[field]
-            if delimiter is None:
-                line = " ".join(tokens)
-            else:
-                line = delimiter.join(tokens)
-        line = cleaner(line)
-        tokens = tokenizer.text2tokens(line)
-        if not write_vocabulary:
-            fout.write(" ".join(tokens) + "\n")
-        else:
-            for t in tokens:
-                counter[t] += 1
-    if not write_vocabulary:
-        return
-    ## FIXME
-    ## del duplicate add_symbols in counter
-    for symbol_and_id in add_symbol:
-        # e.g symbol="<blank>:0"
-        try:
-            symbol, idx = symbol_and_id.split(":")
-        except ValueError:
-            raise RuntimeError(f"Format error: e.g. '<blank>:0': {symbol_and_id}")
-        symbol = symbol.strip()
-        if symbol in counter:
-            del counter[symbol]
-    # ======= write_vocabulary mode from here =======
-    # Sort by the number of occurrences in descending order
-    # and filter lower frequency words than cutoff value
-    words_and_counts = list(
-        filter(lambda x: x[1] > cutoff, sorted(counter.items(), key=lambda x: -x[1]))
-    )
-    # Restrict the vocabulary size
-    if vocabulary_size > 0:
-        if vocabulary_size < len(add_symbol):
-            raise RuntimeError(f"vocabulary_size is too small: {vocabulary_size}")
-        words_and_counts = words_and_counts[: vocabulary_size - len(add_symbol)]
-    # Parse the values of --add_symbol
-    for symbol_and_id in add_symbol:
-        # e.g symbol="<blank>:0"
-        try:
-            symbol, idx = symbol_and_id.split(":")
-            idx = int(idx)
-        except ValueError:
-            raise RuntimeError(f"Format error: e.g. '<blank>:0': {symbol_and_id}")
-        symbol = symbol.strip()
-        # e.g. idx=0  -> append as the first symbol
-        # e.g. idx=-1 -> append as the last symbol
-        if idx < 0:
-            idx = len(words_and_counts) + 1 + idx
-        words_and_counts.insert(idx, (symbol, None))
-    # Write words
-    for w, c in words_and_counts:
-        fout.write(w + "\n")
-    # Logging
-    total_count = sum(counter.values())
-    invocab_count = sum(c for w, c in words_and_counts if c is not None)
-    logging.info(f"OOV rate = {(total_count - invocab_count) / total_count * 100} %")
-def get_parser() -> argparse.ArgumentParser:
-    parser = argparse.ArgumentParser(
-        description="Tokenize texts",
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
-    )
-    parser.add_argument(
-        "--log_level",
-        type=lambda x: x.upper(),
-        default="INFO",
-        choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"),
-        help="The verbose level of logging",
-    )
-    parser.add_argument(
-        "--input", "-i", required=True, help="Input text. - indicates sys.stdin"
-    )
-    parser.add_argument(
-        "--output", "-o", required=True, help="Output text. - indicates sys.stdout"
-    )
-    parser.add_argument(
-        "--field",
-        "-f",
-        help="The target columns of the input text as 1-based integer. e.g 2-",
-    )
-    parser.add_argument(
-        "--token_type",
-        "-t",
-        default="char",
-        choices=["char", "bpe", "word", "phn"],
-        help="Token type",
-    )
-    parser.add_argument("--delimiter", "-d", default=None, help="The delimiter")
-    parser.add_argument("--space_symbol", default="<space>", help="The space symbol")
-    parser.add_argument("--bpemodel", default=None, help="The bpemodel file path")
-    parser.add_argument(
-        "--non_linguistic_symbols",
-        type=str_or_none,
-        help="non_linguistic_symbols file path",
-    )
-    parser.add_argument(
-        "--remove_non_linguistic_symbols",
-        type=str2bool,
-        default=False,
-        help="Remove non-language-symbols from tokens",
-    )
-    parser.add_argument(
-        "--cleaner",
-        type=str_or_none,
-        choices=[None, "tacotron", "jaconv", "vietnamese", "korean_cleaner"],
-        default=None,
-        help="Apply text cleaning",
-    )
-    parser.add_argument(
-        "--g2p",
-        type=str_or_none,
-        choices=g2p_classes,
-        default=None,
-        help="Specify g2p method if --token_type=phn",
-    )
-    group = parser.add_argument_group("write_vocabulary mode related")
-    group.add_argument(
-        "--write_vocabulary",
-        type=str2bool,
-        default=False,
-        help="Write tokens list instead of tokenized text per line",
-    )
-    group.add_argument("--vocabulary_size", type=int, default=0, help="Vocabulary size")
-    group.add_argument(
-        "--cutoff",
-        default=0,
-        type=int,
-        help="cut-off frequency used for write-vocabulary mode",
-    )
-    group.add_argument(
-        "--add_symbol",
-        type=str,
-        default=[],
-        action="append",
-        help="Append symbol e.g. --add_symbol '<blank>:0' --add_symbol '<unk>:1'",
-    )
-    return parser
-def main(cmd=None):
-    print(get_commandline_args(), file=sys.stderr)
-    parser = get_parser()
-    args = parser.parse_args(cmd)
-    kwargs = vars(args)
-    tokenize(**kwargs)
-if __name__ == "__main__":
-    main()

funasr_detach/bin/train.py DELETED Viewed

@@ -1,227 +0,0 @@
-#!/usr/bin/env python3
-# -*- encoding: utf-8 -*-
-import os
-import sys
-import torch
-import hydra
-import logging
-import argparse
-from io import BytesIO
-import torch.distributed as dist
-from collections.abc import Sequence
-from omegaconf import DictConfig, OmegaConf
-from torch.nn.parallel import DistributedDataParallel as DDP
-from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
-from funasr_detach.register import tables
-from funasr_detach.optimizers import optim_classes
-from funasr_detach.train_utils.trainer import Trainer
-from funasr_detach.schedulers import scheduler_classes
-from funasr_detach.train_utils.initialize import initialize
-from funasr_detach.download.download_from_hub import download_model
-from funasr_detach.models.lora.utils import mark_only_lora_as_trainable
-from funasr_detach.train_utils.set_all_random_seed import set_all_random_seed
-from funasr_detach.train_utils.load_pretrained_model import load_pretrained_model
-# from funasr_detach.tokenizer.build_tokenizer import build_tokenizer
-# from funasr_detach.tokenizer.token_id_converter import TokenIDConverter
-# from funasr_detach.tokenizer.funtoken import build_tokenizer
-@hydra.main(config_name=None, version_base=None)
-def main_hydra(kwargs: DictConfig):
-    """Hydra entry point: resolve the model configuration, then call ``main``.
-
-    Args:
-        kwargs: Configuration composed by Hydra. It must contain ``model``.
-            When ``model_conf`` is absent, the configuration is replaced by
-            whatever ``download_model`` returns for it.
-    """
-    # Optional debugging hook: drop into pdb when the config sets ``debug``.
-    if kwargs.get("debug", False):
-        import pdb
-        pdb.set_trace()
-    assert "model" in kwargs
-    if "model_conf" not in kwargs:
-        logging.info(
-            "download models from model hub: {}".format(kwargs.get("model_hub", "ms"))
-        )
-        # Merge the ``is_training`` default into the mapping instead of passing
-        # it as a separate keyword: when the config already defines
-        # ``is_training``, ``f(is_training=..., **kwargs)`` raises
-        # "TypeError: got multiple values for keyword argument".
-        download_kwargs = {**kwargs, "is_training": kwargs.get("is_training", True)}
-        kwargs = download_model(**download_kwargs)
-    main(**kwargs)
-def main(**kwargs):
-    """Build all training components from ``kwargs`` and run ``Trainer``.
-
-    Environment variables read: ``LOCAL_RANK`` (default 0) and ``WORLD_SIZE``
-    (DDP is used when it is set and greater than 1).
-
-    Keys that must be present in ``kwargs``: ``model``, ``model_conf``,
-    ``tokenizer`` with ``tokenizer_conf``, ``optim_conf``, ``scheduler_conf``
-    and ``dataset_conf``. ``frontend_conf`` is required only when ``frontend``
-    is set. The remaining keys read below all have defaults.
-
-    Raises:
-        ValueError: if no tokenizer is configured; the model's ``vocab_size``
-            is taken from ``tokenizer.token_list``.
-    """
-    print(kwargs)
-    # set random seed
-    set_all_random_seed(kwargs.get("seed", 0))
-    torch.backends.cudnn.enabled = kwargs.get(
-        "cudnn_enabled", torch.backends.cudnn.enabled
-    )
-    torch.backends.cudnn.benchmark = kwargs.get(
-        "cudnn_benchmark", torch.backends.cudnn.benchmark
-    )
-    torch.backends.cudnn.deterministic = kwargs.get("cudnn_deterministic", True)
-    local_rank = int(os.environ.get("LOCAL_RANK", 0))
-    if local_rank == 0:
-        tables.print()
-    # Check if we are using DDP or FSDP
-    use_ddp = "WORLD_SIZE" in os.environ and int(os.environ["WORLD_SIZE"]) > 1
-    use_fsdp = kwargs.get("use_fsdp", None)
-    distributed = use_ddp or use_fsdp
-    if distributed:
-        dist.init_process_group(
-            backend=kwargs.get("backend", "nccl"), init_method="env://"
-        )
-        torch.cuda.set_device(local_rank)
-    # save config.yaml (only once: global rank 0 when distributed, else local rank 0)
-    # NOTE(review): the default directory here is "./" while Trainer below
-    # defaults to "./exp" -- confirm whether that difference is intended.
-    writer_rank = dist.get_rank() if distributed else local_rank
-    if writer_rank == 0:
-        os.makedirs(kwargs.get("output_dir", "./"), exist_ok=True)
-        yaml_file = os.path.join(kwargs.get("output_dir", "./"), "config.yaml")
-        OmegaConf.save(config=kwargs, f=yaml_file)
-        logging.info("config.yaml is saved to: %s", yaml_file)
-    # build tokenizer if tokenizer is not None
-    tokenizer = kwargs.get("tokenizer", None)
-    if tokenizer is not None:
-        tokenizer_class = tables.tokenizer_classes.get(tokenizer)
-        tokenizer = tokenizer_class(**kwargs["tokenizer_conf"])
-        kwargs["tokenizer"] = tokenizer
-    # build frontend if frontend is not None
-    frontend = kwargs.get("frontend", None)
-    if frontend is not None:
-        frontend_class = tables.frontend_classes.get(frontend)
-        frontend = frontend_class(**kwargs["frontend_conf"])
-        kwargs["frontend"] = frontend
-        kwargs["input_size"] = frontend.output_size()
-    # build model
-    if tokenizer is None:
-        # Fail with a clear message instead of an AttributeError on None below.
-        raise ValueError(
-            "a 'tokenizer' must be configured: vocab_size is derived from "
-            "tokenizer.token_list"
-        )
-    model_class = tables.model_classes.get(kwargs["model"])
-    model = model_class(
-        **kwargs, **kwargs["model_conf"], vocab_size=len(tokenizer.token_list)
-    )
-    # init_param
-    init_param = kwargs.get("init_param", None)
-    if init_param is not None:
-        if not isinstance(init_param, (list, tuple)):
-            init_param = (init_param,)
-        logging.info("init_param is not None: %s", init_param)
-        for p in init_param:
-            logging.info(f"Loading pretrained params from {p}")
-            load_pretrained_model(
-                model=model,
-                path=p,
-                ignore_init_mismatch=kwargs.get("ignore_init_mismatch", True),
-                oss_bucket=kwargs.get("oss_bucket", None),
-                scope_map=kwargs.get("scope_map", None),
-                excludes=kwargs.get("excludes", None),
-            )
-    else:
-        initialize(model, kwargs.get("init", "kaiming_normal"))
-    # freeze_param
-    freeze_param = kwargs.get("freeze_param", None)
-    if freeze_param is not None:
-        if isinstance(freeze_param, str):
-            # SECURITY NOTE(review): eval() executes arbitrary code taken from
-            # the configuration; only use with trusted configs. Kept as-is so
-            # existing config strings keep working.
-            freeze_param = eval(freeze_param)
-        # Wrap only a single bare prefix. The previous check wrapped every
-        # Sequence, so a list/tuple of prefixes became ((a, b),) and the
-        # ``t + "."`` below raised TypeError.
-        if isinstance(freeze_param, str):
-            freeze_param = (freeze_param,)
-        logging.info("freeze_param is not None: %s", freeze_param)
-        for t in freeze_param:
-            for k, p in model.named_parameters():
-                # Match the exact parameter name or anything under the prefix.
-                if k.startswith(t + ".") or k == t:
-                    logging.info(f"Setting {k}.requires_grad = False")
-                    p.requires_grad = False
-    # train_conf is optional: the same fallback is used for DDP and Trainer.
-    train_conf = kwargs.get("train_conf") or {}
-    if use_ddp:
-        model = model.cuda(local_rank)
-        model = DDP(
-            model,
-            device_ids=[local_rank],
-            find_unused_parameters=train_conf.get("find_unused_parameters", False),
-        )
-    elif use_fsdp:
-        model = FSDP(model).cuda(local_rank)
-    else:
-        model = model.to(device=kwargs.get("device", "cuda"))
-    # optim
-    optim = kwargs.get("optim", "adam")
-    assert optim in optim_classes
-    optim_class = optim_classes.get(optim)
-    optim = optim_class(model.parameters(), **kwargs.get("optim_conf"))
-    # scheduler
-    scheduler = kwargs.get("scheduler", "warmuplr")
-    assert scheduler in scheduler_classes
-    scheduler_class = scheduler_classes.get(scheduler)
-    scheduler = scheduler_class(optim, **kwargs.get("scheduler_conf"))
-    # dataset (the same dataset_conf is forwarded to datasets and samplers)
-    dataset_conf = kwargs.get("dataset_conf")
-    dataset_class = tables.dataset_classes.get(kwargs.get("dataset", "AudioDataset"))
-    dataset_tr = dataset_class(
-        kwargs.get("train_data_set_list"),
-        frontend=frontend,
-        tokenizer=tokenizer,
-        is_training=True,
-        **dataset_conf,
-    )
-    dataset_val = dataset_class(
-        kwargs.get("valid_data_set_list"),
-        frontend=frontend,
-        tokenizer=tokenizer,
-        is_training=False,
-        **dataset_conf,
-    )
-    # dataloader
-    batch_sampler = dataset_conf.get(
-        "batch_sampler", "DynamicBatchLocalShuffleSampler"
-    )
-    batch_sampler_val = None
-    if batch_sampler is not None:
-        batch_sampler_class = tables.batch_sampler_classes.get(batch_sampler)
-        batch_sampler = batch_sampler_class(dataset_tr, **dataset_conf)
-        batch_sampler_val = batch_sampler_class(
-            dataset_val, is_training=False, **dataset_conf
-        )
-    num_workers = dataset_conf.get("num_workers", 4)
-    dataloader_tr = torch.utils.data.DataLoader(
-        dataset_tr,
-        collate_fn=dataset_tr.collator,
-        batch_sampler=batch_sampler,
-        num_workers=num_workers,
-        pin_memory=True,
-    )
-    dataloader_val = torch.utils.data.DataLoader(
-        dataset_val,
-        collate_fn=dataset_val.collator,
-        batch_sampler=batch_sampler_val,
-        num_workers=num_workers,
-        pin_memory=True,
-    )
-    trainer = Trainer(
-        model=model,
-        optim=optim,
-        scheduler=scheduler,
-        dataloader_train=dataloader_tr,
-        dataloader_val=dataloader_val,
-        local_rank=local_rank,
-        use_ddp=use_ddp,
-        use_fsdp=use_fsdp,
-        output_dir=kwargs.get("output_dir", "./exp"),
-        resume=kwargs.get("resume", True),
-        **train_conf,
-    )
-    trainer.run()
-    # Tear down the process group created above.
-    if distributed:
-        torch.distributed.destroy_process_group()
-if __name__ == "__main__":
-    main_hydra()