upload files
This view is limited to 50 files because it contains too many changes.
- RepCodec/.gitignore +160 -0
- RepCodec/LICENSE +428 -0
- RepCodec/README.md +273 -0
- RepCodec/examples/data2vec_audio.py +541 -0
- RepCodec/examples/data2vec_feature_reader.py +87 -0
- RepCodec/examples/dump_feature.py +142 -0
- RepCodec/examples/feature_utils.py +70 -0
- RepCodec/examples/hubert_feature_reader.py +64 -0
- RepCodec/examples/tokens/data2vec_base_l6_dev-clean.tokens +0 -0
- RepCodec/examples/tokens/data2vec_large_l18_dev-clean.tokens +0 -0
- RepCodec/examples/tokens/hubert_base_l9_dev-clean.tokens +0 -0
- RepCodec/examples/tokens/hubert_large_l18_dev-clean.tokens +0 -0
- RepCodec/examples/tokens/whisper_large_l32_dev-clean.tokens +0 -0
- RepCodec/examples/tokens/whisper_medium_l24_dev-clean.tokens +0 -0
- RepCodec/examples/whisper_feature_reader.py +110 -0
- RepCodec/examples/whisper_model.py +58 -0
- RepCodec/repcodec/RepCodec.py +84 -0
- RepCodec/repcodec/configs/repcodec_dim1024.yaml +18 -0
- RepCodec/repcodec/configs/repcodec_dim1280.yaml +18 -0
- RepCodec/repcodec/configs/repcodec_dim768.yaml +18 -0
- RepCodec/repcodec/layers/conv_layer.py +95 -0
- RepCodec/repcodec/layers/vq_module.py +155 -0
- RepCodec/repcodec/modules/decoder.py +109 -0
- RepCodec/repcodec/modules/encoder.py +89 -0
- RepCodec/repcodec/modules/projector.py +32 -0
- RepCodec/repcodec/modules/quantizer.py +46 -0
- RepCodec/repcodec/modules/residual_unit.py +39 -0
- RepCodec/repcodec/tokenize.py +212 -0
- RepCodec/setup.py +31 -0
- RepCodec/train.py +228 -0
- RepCodec/train_configs/ex_dim768_mse.yaml +74 -0
- RepCodec/trainer/autoencoder.py +287 -0
- __pycache__/post_process_audio.cpython-310.pyc +0 -0
- __pycache__/vocoder.cpython-310.pyc +0 -0
- decoders/config.yaml +15 -0
- decoders/decoder_131000.pth +3 -0
- decoders/decoder_151000.pth +3 -0
- descriptaudiocodec/dac/__init__.py +16 -0
- descriptaudiocodec/dac/__main__.py +36 -0
- descriptaudiocodec/dac/__pycache__/__init__.cpython-310.pyc +0 -0
- descriptaudiocodec/dac/__pycache__/__init__.cpython-38.pyc +0 -0
- descriptaudiocodec/dac/__pycache__/__init__.cpython-39.pyc +0 -0
- descriptaudiocodec/dac/compare/__init__.py +0 -0
- descriptaudiocodec/dac/compare/encodec.py +54 -0
- descriptaudiocodec/dac/model/__init__.py +4 -0
- descriptaudiocodec/dac/model/__pycache__/__init__.cpython-310.pyc +0 -0
- descriptaudiocodec/dac/model/__pycache__/__init__.cpython-39.pyc +0 -0
- descriptaudiocodec/dac/model/__pycache__/base.cpython-310.pyc +0 -0
- descriptaudiocodec/dac/model/__pycache__/base.cpython-39.pyc +0 -0
- descriptaudiocodec/dac/model/__pycache__/dac.cpython-310.pyc +0 -0
RepCodec/.gitignore
ADDED
@@ -0,0 +1,160 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/
RepCodec/LICENSE
ADDED
@@ -0,0 +1,428 @@
MIT License

Copyright (c) ByteDance, Inc. and its affiliates.
Copyright (c) Chutong Meng

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.


Attribution-NonCommercial 4.0 International

=======================================================================

Creative Commons Corporation ("Creative Commons") is not a law firm and
does not provide legal services or legal advice. Distribution of
Creative Commons public licenses does not create a lawyer-client or
other relationship. Creative Commons makes its licenses and related
information available on an "as-is" basis. Creative Commons gives no
warranties regarding its licenses, any material licensed under their
terms and conditions, or any related information. Creative Commons
disclaims all liability for damages resulting from their use to the
fullest extent possible.

Using Creative Commons Public Licenses

Creative Commons public licenses provide a standard set of terms and
conditions that creators and other rights holders may use to share
original works of authorship and other material subject to copyright
and certain other rights specified in the public license below. The
following considerations are for informational purposes only, are not
exhaustive, and do not form part of our licenses.

Considerations for licensors: Our public licenses are
intended for use by those authorized to give the public
permission to use material in ways otherwise restricted by
copyright and certain other rights. Our licenses are
irrevocable. Licensors should read and understand the terms
and conditions of the license they choose before applying it.
Licensors should also secure all rights necessary before
applying our licenses so that the public can reuse the
material as expected. Licensors should clearly mark any
material not subject to the license. This includes other CC-
licensed material, or material used under an exception or
limitation to copyright. More considerations for licensors:
wiki.creativecommons.org/Considerations_for_licensors

Considerations for the public: By using one of our public
licenses, a licensor grants the public permission to use the
licensed material under specified terms and conditions. If
the licensor's permission is not necessary for any reason--for
example, because of any applicable exception or limitation to
copyright--then that use is not regulated by the license. Our
licenses grant only permissions under copyright and certain
other rights that a licensor has authority to grant. Use of
the licensed material may still be restricted for other
reasons, including because others have copyright or other
rights in the material. A licensor may make special requests,
such as asking that all changes be marked or described.
Although not required by our licenses, you are encouraged to
respect those requests where reasonable. More_considerations
for the public:
wiki.creativecommons.org/Considerations_for_licensees

=======================================================================

Creative Commons Attribution-NonCommercial 4.0 International Public
License

By exercising the Licensed Rights (defined below), You accept and agree
to be bound by the terms and conditions of this Creative Commons
Attribution-NonCommercial 4.0 International Public License ("Public
License"). To the extent this Public License may be interpreted as a
contract, You are granted the Licensed Rights in consideration of Your
acceptance of these terms and conditions, and the Licensor grants You
such rights in consideration of benefits the Licensor receives from
making the Licensed Material available under these terms and
conditions.

Section 1 -- Definitions.

a. Adapted Material means material subject to Copyright and Similar
Rights that is derived from or based upon the Licensed Material
and in which the Licensed Material is translated, altered,
arranged, transformed, or otherwise modified in a manner requiring
permission under the Copyright and Similar Rights held by the
Licensor. For purposes of this Public License, where the Licensed
Material is a musical work, performance, or sound recording,
Adapted Material is always produced where the Licensed Material is
synched in timed relation with a moving image.

b. Adapter's License means the license You apply to Your Copyright
and Similar Rights in Your contributions to Adapted Material in
accordance with the terms and conditions of this Public License.

c. Copyright and Similar Rights means copyright and/or similar rights
closely related to copyright including, without limitation,
performance, broadcast, sound recording, and Sui Generis Database
Rights, without regard to how the rights are labeled or
categorized. For purposes of this Public License, the rights
specified in Section 2(b)(1)-(2) are not Copyright and Similar
Rights.

d. Effective Technological Measures means those measures that, in the
absence of proper authority, may not be circumvented under laws
fulfilling obligations under Article 11 of the WIPO Copyright
Treaty adopted on December 20, 1996, and/or similar international
agreements.

e. Exceptions and Limitations means fair use, fair dealing, and/or
any other exception or limitation to Copyright and Similar Rights
that applies to Your use of the Licensed Material.

f. Licensed Material means the artistic or literary work, database,
or other material to which the Licensor applied this Public
License.

g. Licensed Rights means the rights granted to You subject to the
terms and conditions of this Public License, which are limited to
all Copyright and Similar Rights that apply to Your use of the
Licensed Material and that the Licensor has authority to license.

h. Licensor means the individual(s) or entity(ies) granting rights
under this Public License.

i. NonCommercial means not primarily intended for or directed towards
commercial advantage or monetary compensation. For purposes of
this Public License, the exchange of the Licensed Material for
other material subject to Copyright and Similar Rights by digital
file-sharing or similar means is NonCommercial provided there is
no payment of monetary compensation in connection with the
exchange.

j. Share means to provide material to the public by any means or
process that requires permission under the Licensed Rights, such
as reproduction, public display, public performance, distribution,
dissemination, communication, or importation, and to make material
available to the public including in ways that members of the
public may access the material from a place and at a time
individually chosen by them.

k. Sui Generis Database Rights means rights other than copyright
resulting from Directive 96/9/EC of the European Parliament and of
the Council of 11 March 1996 on the legal protection of databases,
as amended and/or succeeded, as well as other essentially
equivalent rights anywhere in the world.

l. You means the individual or entity exercising the Licensed Rights
under this Public License. Your has a corresponding meaning.

Section 2 -- Scope.

a. License grant.

1. Subject to the terms and conditions of this Public License,
the Licensor hereby grants You a worldwide, royalty-free,
non-sublicensable, non-exclusive, irrevocable license to
exercise the Licensed Rights in the Licensed Material to:

a. reproduce and Share the Licensed Material, in whole or
in part, for NonCommercial purposes only; and

b. produce, reproduce, and Share Adapted Material for
NonCommercial purposes only.

2. Exceptions and Limitations. For the avoidance of doubt, where
Exceptions and Limitations apply to Your use, this Public
License does not apply, and You do not need to comply with
its terms and conditions.

3. Term. The term of this Public License is specified in Section
6(a).

4. Media and formats; technical modifications allowed. The
Licensor authorizes You to exercise the Licensed Rights in
all media and formats whether now known or hereafter created,
and to make technical modifications necessary to do so. The
Licensor waives and/or agrees not to assert any right or
authority to forbid You from making technical modifications
necessary to exercise the Licensed Rights, including
technical modifications necessary to circumvent Effective
Technological Measures. For purposes of this Public License,
simply making modifications authorized by this Section 2(a)
(4) never produces Adapted Material.

5. Downstream recipients.

a. Offer from the Licensor -- Licensed Material. Every
recipient of the Licensed Material automatically
receives an offer from the Licensor to exercise the
Licensed Rights under the terms and conditions of this
Public License.

b. No downstream restrictions. You may not offer or impose
any additional or different terms or conditions on, or
apply any Effective Technological Measures to, the
Licensed Material if doing so restricts exercise of the
Licensed Rights by any recipient of the Licensed
Material.

6. No endorsement. Nothing in this Public License constitutes or
may be construed as permission to assert or imply that You
are, or that Your use of the Licensed Material is, connected
with, or sponsored, endorsed, or granted official status by,
the Licensor or others designated to receive attribution as
provided in Section 3(a)(1)(A)(i).

b. Other rights.

1. Moral rights, such as the right of integrity, are not
licensed under this Public License, nor are publicity,
privacy, and/or other similar personality rights; however, to
the extent possible, the Licensor waives and/or agrees not to
assert any such rights held by the Licensor to the limited
extent necessary to allow You to exercise the Licensed
Rights, but not otherwise.

2. Patent and trademark rights are not licensed under this
Public License.

3. To the extent possible, the Licensor waives any right to
collect royalties from You for the exercise of the Licensed
Rights, whether directly or through a collecting society
under any voluntary or waivable statutory or compulsory
licensing scheme. In all other cases the Licensor expressly
reserves any right to collect such royalties, including when
the Licensed Material is used other than for NonCommercial
purposes.

Section 3 -- License Conditions.

Your exercise of the Licensed Rights is expressly made subject to the
following conditions.

a. Attribution.

1. If You Share the Licensed Material (including in modified
form), You must:

a. retain the following if it is supplied by the Licensor
with the Licensed Material:

i. identification of the creator(s) of the Licensed
Material and any others designated to receive
attribution, in any reasonable manner requested by
the Licensor (including by pseudonym if
designated);

ii. a copyright notice;

iii. a notice that refers to this Public License;

iv. a notice that refers to the disclaimer of
warranties;

v. a URI or hyperlink to the Licensed Material to the
extent reasonably practicable;

b. indicate if You modified the Licensed Material and
retain an indication of any previous modifications; and

c. indicate the Licensed Material is licensed under this
Public License, and include the text of, or the URI or
hyperlink to, this Public License.

2. You may satisfy the conditions in Section 3(a)(1) in any
reasonable manner based on the medium, means, and context in
which You Share the Licensed Material. For example, it may be
reasonable to satisfy the conditions by providing a URI or
hyperlink to a resource that includes the required
information.

3. If requested by the Licensor, You must remove any of the
information required by Section 3(a)(1)(A) to the extent
reasonably practicable.

4. If You Share Adapted Material You produce, the Adapter's
License You apply must not prevent recipients of the Adapted
Material from complying with this Public License.

Section 4 -- Sui Generis Database Rights.

Where the Licensed Rights include Sui Generis Database Rights that
apply to Your use of the Licensed Material:

a. for the avoidance of doubt, Section 2(a)(1) grants You the right
to extract, reuse, reproduce, and Share all or a substantial
portion of the contents of the database for NonCommercial purposes
only;

b. if You include all or a substantial portion of the database
contents in a database in which You have Sui Generis Database
Rights, then the database in which You have Sui Generis Database
Rights (but not its individual contents) is Adapted Material; and

c. You must comply with the conditions in Section 3(a) if You Share
all or a substantial portion of the contents of the database.

For the avoidance of doubt, this Section 4 supplements and does not
replace Your obligations under this Public License where the Licensed
Rights include other Copyright and Similar Rights.

Section 5 -- Disclaimer of Warranties and Limitation of Liability.

a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE
EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS
AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF
ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS,
IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION,
WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR
PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS,
ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT
KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT
ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU.

b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE
TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION,
NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT,
INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES,
COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR
USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN
ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR
DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR
IN PART, THIS LIMITATION MAY NOT APPLY TO YOU.

c. The disclaimer of warranties and limitation of liability provided
above shall be interpreted in a manner that, to the extent
possible, most closely approximates an absolute disclaimer and
waiver of all liability.

Section 6 -- Term and Termination.

a. This Public License applies for the term of the Copyright and
Similar Rights licensed here. However, if You fail to comply with
this Public License, then Your rights under this Public License
terminate automatically.

b. Where Your right to use the Licensed Material has terminated under
Section 6(a), it reinstates:

1. automatically as of the date the violation is cured, provided
it is cured within 30 days of Your discovery of the
violation; or

2. upon express reinstatement by the Licensor.

For the avoidance of doubt, this Section 6(b) does not affect any
right the Licensor may have to seek remedies for Your violations
of this Public License.

c. For the avoidance of doubt, the Licensor may also offer the
Licensed Material under separate terms or conditions or stop
distributing the Licensed Material at any time; however, doing so
will not terminate this Public License.

d. Sections 1, 5, 6, 7, and 8 survive termination of this Public
License.

Section 7 -- Other Terms and Conditions.

a. The Licensor shall not be bound by any additional or different
terms or conditions communicated by You unless expressly agreed.

b. Any arrangements, understandings, or agreements regarding the
Licensed Material not stated herein are separate from and
independent of the terms and conditions of this Public License.

Section 8 -- Interpretation.

a. For the avoidance of doubt, this Public License does not, and
shall not be interpreted to, reduce, limit, restrict, or impose
conditions on any use of the Licensed Material that could lawfully
be made without permission under this Public License.

b. To the extent possible, if any provision of this Public License is
deemed unenforceable, it shall be automatically reformed to the
minimum extent necessary to make it enforceable. If the provision
cannot be reformed, it shall be severed from this Public License
without affecting the enforceability of the remaining terms and
conditions.

c. No term or condition of this Public License will be waived and no
failure to comply consented to unless expressly agreed to by the
Licensor.

d. Nothing in this Public License constitutes or may be interpreted
as a limitation upon, or waiver of, any privileges and immunities
that apply to the Licensor or You, including from the legal
processes of any jurisdiction or authority.

=======================================================================

Creative Commons is not a party to its public
licenses. Notwithstanding, Creative Commons may elect to apply one of
its public licenses to material it publishes and in those instances
will be considered the "Licensor." The text of the Creative Commons
public licenses is dedicated to the public domain under the CC0 Public
Domain Dedication. Except for the limited purpose of indicating that
material is shared under a Creative Commons public license or as
otherwise permitted by the Creative Commons policies published at
creativecommons.org/policies, Creative Commons does not authorize the
use of the trademark "Creative Commons" or any other trademark or logo
of Creative Commons without its prior written consent including,
without limitation, in connection with any unauthorized modifications
to any of its public licenses or any other arrangements,
understandings, or agreements concerning use of licensed material. For
the avoidance of doubt, this paragraph does not form part of the
public licenses.

Creative Commons may be contacted at creativecommons.org.
RepCodec/README.md
ADDED
@@ -0,0 +1,273 @@
# RepCodec: A Speech Representation Codec for Speech Tokenization

> [**RepCodec: A Speech Representation Codec for Speech Tokenization**](https://arxiv.org/abs/2309.00169)

## Introduction

**RepCodec** is a speech tokenization method for converting a speech waveform into a sequence of discrete semantic
tokens.
The main idea is to train a representation codec which learns a vector quantization codebook by reconstructing the
input speech representations from speech encoders like HuBERT or data2vec.
Extensive experiments show that RepCodec significantly outperforms the widely used k-means clustering approach in both
speech understanding and generation.
Also, RepCodec generalizes well across various speech encoders and languages.

<img src="images/RepCodec.png" alt="RepCodec" width="1000" />

## RepCodec Models

| Feature Type | Speech Data | RepCodec Model |
|--------------|-------------|----------------|
| [HuBERT base](https://github.com/facebookresearch/fairseq/tree/main/examples/hubert#pre-trained-and-fine-tuned-asr-models) layer 9 | [Librispeech](http://www.openslr.org/12) train-clean-100 | [hubert_base_l9](https://drive.google.com/file/d/1XD0HKl607FFjri2-VJT7lHQeSpxsCCFO/view?usp=sharing) |
| [HuBERT large](https://github.com/facebookresearch/fairseq/tree/main/examples/hubert#pre-trained-and-fine-tuned-asr-models) layer 18 | [Librispeech](http://www.openslr.org/12) train-clean-100 | [hubert_large_l18](https://drive.google.com/file/d/1mTbm5GeJ7gp_5L3QLP-JGXdf8RnRw5n6/view?usp=sharing) |
| [data2vec base](https://github.com/facebookresearch/fairseq/blob/main/examples/data2vec/README.md#speech-2) layer 6 | [Librispeech](http://www.openslr.org/12) train-clean-100 | [data2vec_base_l6](https://drive.google.com/file/d/1d8sf3Ko_fYM9zlaiwxK_4xusLRKV5EMd/view?usp=sharing) |
| [data2vec large](https://github.com/facebookresearch/fairseq/blob/main/examples/data2vec/README.md#speech-2) layer 18 | [Librispeech](http://www.openslr.org/12) train-clean-100 | [data2vec_large_l18](https://drive.google.com/file/d/1nuRIHaejT-uVi4cluftbT8o_JZqar5SU/view?usp=sharing) |
| [Whisper medium](https://github.com/openai/whisper/tree/main#available-models-and-languages) layer 24 | [Librispeech](http://www.openslr.org/12) train-clean-100 | [whisper_medium_l24](https://drive.google.com/file/d/1V6YJSA2V4iywXrecJAN0oqsa3aHowexZ/view?usp=sharing) |
| [Whisper large-v2](https://github.com/openai/whisper/tree/main#available-models-and-languages) layer 32 | [Librispeech](http://www.openslr.org/12) train-clean-100 | [whisper_large_l32](https://drive.google.com/file/d/1k_X7ZMPg8iOeDrIJe70v6CHfFygzufXC/view?usp=sharing) |

## Speech Tokenization Using Pre-Trained Models

### Installation

Please first install RepCodec by

```
git clone https://github.com/mct10/RepCodec.git
cd RepCodec
pip install .
```

We used Python 3.9.18 and PyTorch 1.12.1 to test the usage, but the code should be compatible with other recent Python
and PyTorch versions.

### Representation Preparation

We adapt the `dump_hubert_feature.py` script
from [fairseq](https://github.com/facebookresearch/fairseq/tree/main/examples/hubert/simple_kmeans#hubert-feature)
to support dumping representations from **data2vec**, **HuBERT**, or **Whisper** encoders.

If you use our script (`examples/dump_feature.py`), please also install the following packages:

```
pip install npy_append_array soundfile
```

Additionally, if you want to dump representations from

- **data2vec** or **HuBERT**: please
  follow [fairseq's instructions](https://github.com/facebookresearch/fairseq#requirements-and-installation) to install
  the latest fairseq.

- **Whisper**: please follow [Whisper's instructions](https://github.com/openai/whisper/tree/main#setup) to install the
  latest Whisper.

Then, you can follow the given examples to dump representations:

```
# Example 1: dump from HuBERT base layer 9
# (for data2vec, simply change "model_type" to data2vec and "ckpt_path" to the path of the data2vec model)

layer=9

python3 examples/dump_feature.py \
    --model_type hubert \
    --tsv_path /path/to/tsv/file \
    --ckpt_path /path/to/HuBERT/model \
    --layer ${layer} \
    --feat_dir /dir/to/save/representations


# Example 2: dump from Whisper medium layer 24

layer=24

python3 examples/dump_feature.py \
    --model_type whisper \
    --tsv_path /path/to/tsv/file \
    --whisper_root /directory/to/save/whisper/model \
    --whisper_name medium \
    --layer ${layer} \
    --feat_dir /dir/to/save/representations
```

Explanations of the arguments:

- **model_type:** choose from `data2vec`, `hubert`, and `whisper`.

- **tsv_path:** path of the tsv file.
  It should have the following format:

  ```
  /dir/to/dataset
  path_of_utterance_1 number_of_frames
  path_of_utterance_2 number_of_frames
  ```

  You can follow [this script](https://github.com/facebookresearch/fairseq/blob/main/examples/wav2vec/wav2vec_manifest.py)
  to generate the tsv file.

  For example, by running

  ```
  python wav2vec_manifest.py \
      /dir/to/LibriSpeech/dev-clean \
      --dest /dir/to/manifest \
      --ext flac \
      --valid-percent 0
  ```

  you can obtain the `dev-clean.tsv` in `/dir/to/manifest` for LibriSpeech. (By default, the output file name
  is `train.tsv`. Remember to rename the file.)

  It should be similar to:

  ```
  /dir/to/LibriSpeech/dev-clean
  2277/149896/2277-149896-0026.flac 78720
  2277/149896/2277-149896-0005.flac 89600
  2277/149896/2277-149896-0033.flac 45520
  ```

- **ckpt_path**:
  must be provided for data2vec and HuBERT.
  You need to download the model
  from the [data2vec website](https://github.com/facebookresearch/fairseq/blob/main/examples/data2vec/README.md#speech-2)
  or the [HuBERT website](https://github.com/facebookresearch/fairseq/tree/main/examples/hubert#pre-trained-and-fine-tuned-asr-models)
  yourself.
  `--ckpt_path` is the path of the data2vec/HuBERT model.
- **whisper_root** and **whisper_name**:
  **BOTH** `--whisper_root` and `--whisper_name` must be provided for Whisper.
  If there is no corresponding model in `--whisper_root`, the script will download it for you.

- **layer**:
  the Transformer encoder layer from which the representations are extracted.
  It is **1-based**.
  For example, if layer=9, then the outputs from the 9<sup>th</sup> Transformer encoder layer are dumped.
  Range: [1, number of Transformer encoder layers].

- **feat_dir**: The output representations will be saved to `${feat_dir}/0_1.npy`
  and `${feat_dir}/0_1.len` (see the loading sketch after this list).

For other useful functionalities (e.g., sharding), please check the argument list in `examples/dump_feature.py`.

+
### Command Line Usage
|
155 |
+
|
156 |
+
We expect to have `${feat_dir}/0_1.npy` and `${feat_dir}/0_1.len` in the provided
|
157 |
+
directory `/dir/to/representaitons`.
|
158 |
+
|
159 |
+
Also, the tsv file should be the **same** as the one used in [Representation Preparation](#representation-preparation).
|
160 |
+
|
161 |
+
```
|
162 |
+
repcodec /dir/to/representaitons \
|
163 |
+
--model /path/to/repcodec/model \
|
164 |
+
--tsv_path /path/to/tsv/file \
|
165 |
+
[--model_config_path /path/to/train/config] \
|
166 |
+
[--use_gpu] \
|
167 |
+
[--out_dir /path/to/output]
|
168 |
+
```
|
169 |
+
|
170 |
+
If you trained the model yourself following [Training New RepCodec Models](#training-new-repcodec-models),
|
171 |
+
please provide the training config file using `--model_config_path`.
|
172 |
+
If you use the model we provide [here](#repcodec-models), then you do not have to provide that.
|
173 |
+
|
174 |
+
This command will tokenize the representations and the output discrete tokens will be saved to `${out_dir}/tokens`.
|
175 |
+
The tokens are in the same order as the provided tsv file.
|
176 |
+
|
177 |
+
An example of the output file:
|
178 |
+
|
179 |
+
```
|
180 |
+
/dir/to/LibriSpeech/dev-clean
|
181 |
+
2277/149896/2277-149896-0026.flac 696 696 198 198 198 498 ...
|
182 |
+
2277/149896/2277-149896-0005.flac 696 696 198 198 198 907 ...
|
183 |
+
2277/149896/2277-149896-0033.flac 696 696 198 198 198 696 ...
|
184 |
+
```
|
185 |
+
|
186 |
+
Under `examples/tokens`, we provide some token files as references. They are obtained from LibriSpeech dev-clean subset
|
187 |
+
using the 6 types of representations and corresponding [RepCodec Models](#repcodec-models).
|
188 |
+
Your results should be very similar to ours.
|
189 |
+
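If you need the tokens programmatically rather than as text, the file format shown above is easy to parse. A minimal sketch (not part of the package; the path is a placeholder) that reads it into a dict mapping each utterance path to its list of integer token IDs:

```python
token_file = "/path/to/output/tokens"  # placeholder path

utt2tokens = {}
with open(token_file) as f:
    root = f.readline().strip()  # first line is the dataset root directory
    for line in f:
        utt_path, *ids = line.split()
        utt2tokens[utt_path] = [int(i) for i in ids]

print(f"{len(utt2tokens)} utterances under {root}")
```
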
### Python Usage

```python
import torch
import yaml

from repcodec.RepCodec import RepCodec

# for feature types of HuBERT base & data2vec base, please use repcodec_dim768.yaml;
# for feature types of HuBERT large & data2vec large & Whisper medium, please use repcodec_dim1024.yaml;
# for feature types of Whisper large-v2, please use repcodec_dim1280.yaml
config = "repcodec/configs/repcodec_dim768.yaml"
with open(config) as fp:
    conf = yaml.load(fp, Loader=yaml.FullLoader)

model = RepCodec(**conf)
model.load_state_dict(torch.load("./hubert_base_l9.pkl", map_location="cpu")["model"]["repcodec"])
model.quantizer.initial()
model.eval()

# input shape: (batch size, hidden dim, sequence length)
random_features = torch.randn(size=(1, 768, 100))
with torch.no_grad():
    x = model.encoder(random_features)
    z = model.projector(x)
    _, idx = model.quantizer.codebook.forward_index(z.transpose(2, 1))
    tokens = idx.cpu().data.numpy().tolist()[0]
```

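To run the same snippet on real dumped representations instead of `random_features`, the features of one utterance (a `(num_frames, hidden_dim)` array obtained in [Representation Preparation](#representation-preparation)) only need to be moved to the `(batch, hidden_dim, num_frames)` layout the model expects. A minimal sketch, reusing `model` from the block above and assuming `utt_feat` is such a NumPy array (the file name is a placeholder):

```python
import numpy as np
import torch

utt_feat = np.load("one_utterance.npy")  # placeholder: (num_frames, hidden_dim) features of one utterance
x = torch.from_numpy(utt_feat).float().T.unsqueeze(0)  # -> (1, hidden_dim, num_frames)

with torch.no_grad():
    z = model.projector(model.encoder(x))
    _, idx = model.quantizer.codebook.forward_index(z.transpose(2, 1))
tokens = idx[0].cpu().tolist()  # token IDs for this utterance
```
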
## Training New RepCodec Models

We use a config file to set up all the training configurations, e.g., data, model architecture,
optimizer, and scheduler.
We provide an example [here](./train_configs/ex_dim768_mse.yaml).

Please first install the required packages following [Installation](#installation)
and prepare the representations following [Representation Preparation](#representation-preparation).

The input data directory is expected to have the following structure:
```
/dir/to/representations/
    train_set_name/
        0_1.npy
        0_1.len
    valid_set_name/
        0_1.npy
        0_1.len
    test_set_name/
        0_1.npy
        0_1.len
```

The names of the subsets should be the same as the fields in the config file.

Then, you can run training by
```
python train.py \
    -c /path/to/config/file \
    --tag $tag \
    --exp_root exp
```

`tag` is the name of the output folder.
All outputs will be saved to `exp_root/tag/`.

## Acknowledgements

Our implementation is based on [facebookresearch/AudioDec](https://github.com/facebookresearch/AudioDec).
We thank them for open-sourcing their code!

## Citation

If you find our work useful, please cite the following article.

```
@misc{huang2023repcodec,
    title={RepCodec: A Speech Representation Codec for Speech Tokenization},
    author={Zhichao Huang and Chutong Meng and Tom Ko},
    year={2023},
    eprint={2309.00169},
    archivePrefix={arXiv},
    primaryClass={eess.AS}
}
```
RepCodec/examples/data2vec_audio.py
ADDED
@@ -0,0 +1,541 @@
# Copyright (c) ByteDance, Inc. and its affiliates.
# Copyright (c) Chutong Meng
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
# Based on fairseq (https://github.com/facebookresearch/fairseq)

# ref: https://github.com/facebookresearch/fairseq/blob/main/examples/data2vec/models/data2vec_audio.py

import logging
import math
from dataclasses import dataclass, field
from typing import Optional

from omegaconf import II

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.distributed as dist

from fairseq.modules import EMAModule, EMAModuleConfig
from fairseq.data.data_utils import compute_mask_indices
from fairseq.models import BaseFairseqModel, register_model
from fairseq.models.wav2vec import (
    ConvFeatureExtractionModel,
    Wav2Vec2Config,
    TransformerEncoder,
)
from fairseq.modules import (
    GradMultiply,
    LayerNorm,
)
from fairseq.utils import index_put


logger = logging.getLogger(__name__)


@dataclass
class Data2VecAudioConfig(Wav2Vec2Config):

    loss_beta: float = field(
        default=0, metadata={"help": "beta for smooth l1 loss. 0 means use l2 loss"}
    )
    loss_scale: Optional[float] = field(
        default=None,
        metadata={
            "help": "scale the reconstruction loss by this constant. if None then scales by 1/sqrt(dim)"
        },
    )
    average_top_k_layers: int = field(
        default=8, metadata={"help": "how many layers to average"}
    )

    layer_norm_target_layer: bool = False
    instance_norm_target_layer: bool = False
    instance_norm_targets: bool = False
    layer_norm_targets: bool = False
    batch_norm_target_layer: bool = False
    group_norm_target_layer: bool = False

    ema_decay: float = field(default=0.999, metadata={"help": "initial ema decay rate"})
    ema_end_decay: float = field(
        default=0.9999, metadata={"help": "final ema decay rate"}
    )

    # when to finish annealing ema decay rate
    ema_anneal_end_step: int = II("optimization.max_update")

    ema_transformer_only: bool = field(
        default=True,
        metadata={"help": "whether to momentum update only the transformer"},
    )
    ema_layers_only: bool = field(
        default=True,
        metadata={"help": "whether to momentum update only the transformer layers"},
    )

    max_update: int = II("optimization.max_update")

    min_target_var: float = field(
        default=0.1, metadata={"help": "stop training if target var falls below this"}
    )
    min_pred_var: float = field(
        default=0.01,
        metadata={"help": "stop training if prediction var falls below this"},
    )


def get_annealed_rate(start, end, curr_step, total_steps):
    r = end - start
    pct_remaining = 1 - curr_step / total_steps
    return end - r * pct_remaining


@register_model("data2vec_audio", dataclass=Data2VecAudioConfig)
class Data2VecAudioModel(BaseFairseqModel):
    def __init__(self, cfg: Data2VecAudioConfig):
        super().__init__()
        self.cfg = cfg

        feature_enc_layers = eval(cfg.conv_feature_layers)
        self.extractor_embed = feature_enc_layers[-1][0]

        self.ema = None
        self.embed = cfg.encoder_embed_dim

        self.average_top_k_layers = cfg.average_top_k_layers
        self.loss_beta = cfg.loss_beta
        self.loss_scale = cfg.loss_scale

        self.feature_extractor = ConvFeatureExtractionModel(
            conv_layers=feature_enc_layers,
            dropout=0.0,
            mode=cfg.extractor_mode,
            conv_bias=cfg.conv_bias,
        )

        self.post_extract_proj = nn.Linear(self.extractor_embed, cfg.encoder_embed_dim)

        self.mask_prob = cfg.mask_prob
        self.mask_selection = cfg.mask_selection
        self.mask_other = cfg.mask_other
        self.mask_length = cfg.mask_length
        self.no_mask_overlap = cfg.no_mask_overlap
        self.mask_min_space = cfg.mask_min_space

        self.mask_channel_prob = cfg.mask_channel_prob
        self.mask_channel_before = cfg.mask_channel_before
        self.mask_channel_selection = cfg.mask_channel_selection
        self.mask_channel_other = cfg.mask_channel_other
        self.mask_channel_length = cfg.mask_channel_length
        self.no_mask_channel_overlap = cfg.no_mask_channel_overlap
        self.mask_channel_min_space = cfg.mask_channel_min_space

        self.dropout_input = nn.Dropout(cfg.dropout_input)
        self.dropout_features = nn.Dropout(cfg.dropout_features)

        self.feature_grad_mult = cfg.feature_grad_mult

        self.mask_emb = nn.Parameter(
            torch.FloatTensor(cfg.encoder_embed_dim).uniform_()
        )

        self.encoder = TransformerEncoder(cfg)
        self.layer_norm = LayerNorm(self.extractor_embed)

        self.final_proj = nn.Linear(self.embed, self.embed)

        self.num_updates = 0

    def make_ema_teacher(self):
        ema_config = EMAModuleConfig(
            ema_decay=self.cfg.ema_decay,
            ema_fp32=True,
        )
        skip_keys = set()
        if self.cfg.ema_layers_only:
            self.cfg.ema_transformer_only = True
            for k, _ in self.encoder.pos_conv.named_parameters():
                skip_keys.add(f"pos_conv.{k}")

        self.ema = EMAModule(
            self.encoder if self.cfg.ema_transformer_only else self,
            ema_config,
            skip_keys=skip_keys,
        )

    def set_num_updates(self, num_updates):
        super().set_num_updates(num_updates)

        if self.ema is None and self.final_proj is not None:
            logger.info(f"making ema teacher")
            self.make_ema_teacher()
        elif self.training and self.ema is not None:
            if self.cfg.ema_decay != self.cfg.ema_end_decay:
                if num_updates >= self.cfg.ema_anneal_end_step:
                    decay = self.cfg.ema_end_decay
                else:
                    decay = get_annealed_rate(
                        self.cfg.ema_decay,
                        self.cfg.ema_end_decay,
                        num_updates,
                        self.cfg.ema_anneal_end_step,
                    )
                self.ema.set_decay(decay)
            if self.ema.get_decay() < 1:
                self.ema.step(self.encoder if self.cfg.ema_transformer_only else self)

        self.num_updates = num_updates

    def state_dict(self, destination=None, prefix="", keep_vars=False):
        state = super().state_dict(destination, prefix, keep_vars)

        if self.ema is not None:
            state[prefix + "_ema"] = self.ema.fp32_params

        return state

    def _load_from_state_dict(self, state_dict, prefix, *args, **kwargs):
        if self.ema is not None:
            k = prefix + "_ema"
            assert k in state_dict
            self.ema.restore(state_dict[k], True)
            del state_dict[k]
        return super()._load_from_state_dict(state_dict, prefix, *args, **kwargs)

    @classmethod
    def build_model(cls, cfg: Data2VecAudioConfig, task=None):
        """Build a new model instance."""

        return cls(cfg)

    def apply_mask(
        self,
        x,
        padding_mask,
        mask_indices=None,
        mask_channel_indices=None,
    ):
        B, T, C = x.shape

        if self.mask_channel_prob > 0 and self.mask_channel_before:
            mask_channel_indices = compute_mask_indices(
                (B, C),
                None,
                self.mask_channel_prob,
                self.mask_channel_length,
                self.mask_channel_selection,
                self.mask_channel_other,
                no_overlap=self.no_mask_channel_overlap,
                min_space=self.mask_channel_min_space,
            )
            mask_channel_indices = (
                torch.from_numpy(mask_channel_indices)
                .to(x.device)
                .unsqueeze(1)
                .expand(-1, T, -1)
            )
            x[mask_channel_indices] = 0

        if self.mask_prob > 0:
            if mask_indices is None:
                mask_indices = compute_mask_indices(
                    (B, T),
                    padding_mask,
                    self.mask_prob,
                    self.mask_length,
                    self.mask_selection,
                    self.mask_other,
                    min_masks=1,
                    no_overlap=self.no_mask_overlap,
                    min_space=self.mask_min_space,
                    require_same_masks=self.cfg.require_same_masks,
                    mask_dropout=self.cfg.mask_dropout,
                )
                mask_indices = torch.from_numpy(mask_indices).to(x.device)
            x = index_put(x, mask_indices, self.mask_emb)
        else:
            mask_indices = None

        if self.mask_channel_prob > 0 and not self.mask_channel_before:
            if mask_channel_indices is None:
                mask_channel_indices = compute_mask_indices(
                    (B, C),
                    None,
                    self.mask_channel_prob,
                    self.mask_channel_length,
                    self.mask_channel_selection,
                    self.mask_channel_other,
                    no_overlap=self.no_mask_channel_overlap,
                    min_space=self.mask_channel_min_space,
                )
                mask_channel_indices = (
                    torch.from_numpy(mask_channel_indices)
                    .to(x.device)
                    .unsqueeze(1)
                    .expand(-1, T, -1)
                )
            x = index_put(x, mask_channel_indices, 0)

        return x, mask_indices

    def _get_feat_extract_output_lengths(self, input_lengths: torch.LongTensor):
        """
        Computes the output length of the convolutional layers
        """

        def _conv_out_length(input_length, kernel_size, stride):
            return torch.floor((input_length - kernel_size) / stride + 1)

        conv_cfg_list = eval(self.cfg.conv_feature_layers)

        for i in range(len(conv_cfg_list)):
            input_lengths = _conv_out_length(
                input_lengths, conv_cfg_list[i][1], conv_cfg_list[i][2]
            )

        return input_lengths.to(torch.long)

    def forward(
        self,
        source,
        padding_mask=None,
        mask=True,
        features_only=False,
        layer=None,
        mask_indices=None,
        mask_channel_indices=None,
        padding_count=None,
    ):
        features = source

        if self.feature_grad_mult > 0:
            features = self.feature_extractor(features)
            if self.feature_grad_mult != 1.0:
                features = GradMultiply.apply(features, self.feature_grad_mult)
        else:
            with torch.no_grad():
                features = self.feature_extractor(features)

        features = features.transpose(1, 2)

        features = self.layer_norm(features)

        orig_padding_mask = padding_mask

        if padding_mask is not None and padding_mask.any():
            input_lengths = (1 - padding_mask.long()).sum(-1)
            # apply conv formula to get real output_lengths
|
332 |
+
output_lengths = self._get_feat_extract_output_lengths(input_lengths)
|
333 |
+
|
334 |
+
padding_mask = torch.zeros(
|
335 |
+
features.shape[:2], dtype=features.dtype, device=features.device
|
336 |
+
)
|
337 |
+
|
338 |
+
# these two operations make sure that all values
|
339 |
+
# before the output lengths indices are attended to
|
340 |
+
padding_mask[
|
341 |
+
(
|
342 |
+
torch.arange(padding_mask.shape[0], device=padding_mask.device),
|
343 |
+
output_lengths - 1,
|
344 |
+
)
|
345 |
+
] = 1
|
346 |
+
padding_mask = (1 - padding_mask.flip([-1]).cumsum(-1).flip([-1])).bool()
|
347 |
+
else:
|
348 |
+
padding_mask = None
|
349 |
+
|
350 |
+
if self.post_extract_proj is not None:
|
351 |
+
features = self.post_extract_proj(features)
|
352 |
+
|
353 |
+
pre_encoder_features = None
|
354 |
+
if self.cfg.ema_transformer_only:
|
355 |
+
pre_encoder_features = features.clone()
|
356 |
+
|
357 |
+
features = self.dropout_input(features)
|
358 |
+
|
359 |
+
if mask:
|
360 |
+
x, mask_indices = self.apply_mask(
|
361 |
+
features,
|
362 |
+
padding_mask,
|
363 |
+
mask_indices=mask_indices,
|
364 |
+
mask_channel_indices=mask_channel_indices,
|
365 |
+
)
|
366 |
+
else:
|
367 |
+
x = features
|
368 |
+
mask_indices = None
|
369 |
+
|
370 |
+
x, layer_results = self.encoder(
|
371 |
+
x,
|
372 |
+
padding_mask=padding_mask,
|
373 |
+
layer=layer,
|
374 |
+
)
|
375 |
+
|
376 |
+
if features_only:
|
377 |
+
return {
|
378 |
+
"x": x,
|
379 |
+
"padding_mask": padding_mask,
|
380 |
+
"layer_results": layer_results,
|
381 |
+
}
|
382 |
+
|
383 |
+
result = {
|
384 |
+
"losses": {},
|
385 |
+
}
|
386 |
+
|
387 |
+
with torch.no_grad():
|
388 |
+
self.ema.model.eval()
|
389 |
+
|
390 |
+
if self.cfg.ema_transformer_only:
|
391 |
+
y, layer_results = self.ema.model.extract_features(
|
392 |
+
pre_encoder_features,
|
393 |
+
padding_mask=padding_mask,
|
394 |
+
min_layer=self.cfg.encoder_layers - self.average_top_k_layers,
|
395 |
+
)
|
396 |
+
y = {
|
397 |
+
"x": y,
|
398 |
+
"padding_mask": padding_mask,
|
399 |
+
"layer_results": layer_results,
|
400 |
+
}
|
401 |
+
else:
|
402 |
+
y = self.ema.model.extract_features(
|
403 |
+
source=source,
|
404 |
+
padding_mask=orig_padding_mask,
|
405 |
+
mask=False,
|
406 |
+
)
|
407 |
+
|
408 |
+
target_layer_results = [l[2] for l in y["layer_results"]]
|
409 |
+
|
410 |
+
permuted = False
|
411 |
+
if self.cfg.instance_norm_target_layer or self.cfg.batch_norm_target_layer:
|
412 |
+
target_layer_results = [
|
413 |
+
tl.permute(1, 2, 0) for tl in target_layer_results # TBC -> BCT
|
414 |
+
]
|
415 |
+
permuted = True
|
416 |
+
|
417 |
+
if self.cfg.batch_norm_target_layer:
|
418 |
+
target_layer_results = [
|
419 |
+
F.batch_norm(
|
420 |
+
tl.float(), running_mean=None, running_var=None, training=True
|
421 |
+
)
|
422 |
+
for tl in target_layer_results
|
423 |
+
]
|
424 |
+
|
425 |
+
if self.cfg.instance_norm_target_layer:
|
426 |
+
target_layer_results = [
|
427 |
+
F.instance_norm(tl.float()) for tl in target_layer_results
|
428 |
+
]
|
429 |
+
|
430 |
+
if permuted:
|
431 |
+
target_layer_results = [
|
432 |
+
tl.transpose(1, 2) for tl in target_layer_results # BCT -> BTC
|
433 |
+
]
|
434 |
+
|
435 |
+
if self.cfg.group_norm_target_layer:
|
436 |
+
target_layer_results = [
|
437 |
+
F.layer_norm(tl.float(), tl.shape[-2:])
|
438 |
+
for tl in target_layer_results
|
439 |
+
]
|
440 |
+
|
441 |
+
if self.cfg.layer_norm_target_layer:
|
442 |
+
target_layer_results = [
|
443 |
+
F.layer_norm(tl.float(), tl.shape[-1:])
|
444 |
+
for tl in target_layer_results
|
445 |
+
]
|
446 |
+
|
447 |
+
y = sum(target_layer_results) / len(target_layer_results)
|
448 |
+
|
449 |
+
if self.cfg.layer_norm_targets:
|
450 |
+
y = F.layer_norm(y.float(), y.shape[-1:])
|
451 |
+
|
452 |
+
if self.cfg.instance_norm_targets:
|
453 |
+
y = F.instance_norm(y.float().transpose(1, 2)).transpose(1, 2)
|
454 |
+
|
455 |
+
if not permuted:
|
456 |
+
y = y.transpose(0, 1)
|
457 |
+
|
458 |
+
y = y[mask_indices]
|
459 |
+
|
460 |
+
x = x[mask_indices]
|
461 |
+
x = self.final_proj(x)
|
462 |
+
|
463 |
+
sz = x.size(-1)
|
464 |
+
|
465 |
+
if self.loss_beta == 0:
|
466 |
+
loss = F.mse_loss(x.float(), y.float(), reduction="none").sum(dim=-1)
|
467 |
+
else:
|
468 |
+
loss = F.smooth_l1_loss(
|
469 |
+
x.float(), y.float(), reduction="none", beta=self.loss_beta
|
470 |
+
).sum(dim=-1)
|
471 |
+
|
472 |
+
if self.loss_scale is not None:
|
473 |
+
scale = self.loss_scale
|
474 |
+
else:
|
475 |
+
scale = 1 / math.sqrt(sz)
|
476 |
+
|
477 |
+
result["losses"]["regression"] = loss.sum() * scale
|
478 |
+
|
479 |
+
if "sample_size" not in result:
|
480 |
+
result["sample_size"] = loss.numel()
|
481 |
+
|
482 |
+
with torch.no_grad():
|
483 |
+
result["target_var"] = self.compute_var(y)
|
484 |
+
result["pred_var"] = self.compute_var(x.float())
|
485 |
+
|
486 |
+
if self.num_updates > 5000 and result["target_var"] < self.cfg.min_target_var:
|
487 |
+
logger.error(
|
488 |
+
f"target var is {result['target_var'].item()} < {self.cfg.min_target_var}, exiting"
|
489 |
+
)
|
490 |
+
raise Exception(
|
491 |
+
f"target var is {result['target_var'].item()} < {self.cfg.min_target_var}, exiting"
|
492 |
+
)
|
493 |
+
if self.num_updates > 5000 and result["pred_var"] < self.cfg.min_pred_var:
|
494 |
+
logger.error(
|
495 |
+
f"pred var is {result['pred_var'].item()} < {self.cfg.min_pred_var}, exiting"
|
496 |
+
)
|
497 |
+
raise Exception(
|
498 |
+
f"pred var is {result['pred_var'].item()} < {self.cfg.min_pred_var}, exiting"
|
499 |
+
)
|
500 |
+
|
501 |
+
if self.ema is not None:
|
502 |
+
result["ema_decay"] = self.ema.get_decay() * 1000
|
503 |
+
|
504 |
+
return result
|
505 |
+
|
506 |
+
@staticmethod
|
507 |
+
def compute_var(y):
|
508 |
+
y = y.view(-1, y.size(-1))
|
509 |
+
if dist.is_initialized():
|
510 |
+
zc = torch.tensor(y.size(0)).cuda()
|
511 |
+
zs = y.sum(dim=0)
|
512 |
+
zss = (y ** 2).sum(dim=0)
|
513 |
+
|
514 |
+
dist.all_reduce(zc)
|
515 |
+
dist.all_reduce(zs)
|
516 |
+
dist.all_reduce(zss)
|
517 |
+
|
518 |
+
var = zss / (zc - 1) - (zs ** 2) / (zc * (zc - 1))
|
519 |
+
return torch.sqrt(var + 1e-6).mean()
|
520 |
+
else:
|
521 |
+
return torch.sqrt(y.var(dim=0) + 1e-6).mean()
|
522 |
+
|
523 |
+
def extract_features(
|
524 |
+
self, source, padding_mask, mask=False, layer=None
|
525 |
+
):
|
526 |
+
res = self.forward(
|
527 |
+
source,
|
528 |
+
padding_mask,
|
529 |
+
mask=mask,
|
530 |
+
features_only=True,
|
531 |
+
layer=layer,
|
532 |
+
)
|
533 |
+
return res
|
534 |
+
|
535 |
+
def remove_pretraining_modules(self, last_layer=None):
|
536 |
+
self.final_proj = None
|
537 |
+
self.ema = None
|
538 |
+
if last_layer is not None:
|
539 |
+
self.encoder.layers = nn.ModuleList(
|
540 |
+
l for i, l in enumerate(self.encoder.layers) if i <= last_layer
|
541 |
+
)
|
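The teacher-target construction above is spread across several config branches; the following condensed, self-contained sketch shows the same idea (average the top-k layer outputs of the EMA teacher, normalize each layer, then optionally layer-normalize the averaged target). The helper name and the choice of instance normalization are illustrative, not part of this file.

import torch.nn.functional as F

def build_regression_target(layer_outputs, k, layer_norm_targets=True):
    # layer_outputs: list of (T, B, C) tensors from the EMA teacher's transformer layers.
    # Instance-normalize each of the last k layers (cf. instance_norm_target_layer), then average.
    normed = [
        F.instance_norm(t.permute(1, 2, 0).float()).transpose(1, 2)  # (T, B, C) -> (B, T, C)
        for t in layer_outputs[-k:]
    ]
    y = sum(normed) / len(normed)
    if layer_norm_targets:
        y = F.layer_norm(y, y.shape[-1:])
    return y  # (B, T, C) regression target for the masked positions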
RepCodec/examples/data2vec_feature_reader.py
ADDED
@@ -0,0 +1,87 @@
# Copyright (c) ByteDance, Inc. and its affiliates.
# Copyright (c) Chutong Meng
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
# Based on fairseq (https://github.com/facebookresearch/fairseq)

import logging

import torch
import torch.nn.functional as F
from fairseq import tasks
from fairseq.checkpoint_utils import load_checkpoint_to_cpu
from fairseq.data.audio.audio_utils import get_features_or_waveform
from omegaconf import OmegaConf

from data2vec_audio import Data2VecAudioModel

logger = logging.getLogger("dump_feature")


class Data2vecFeatureReader(object):
    def __init__(self, ckpt_path: str, layer: int, device: str, max_chunk=1600000):
        state = load_checkpoint_to_cpu(ckpt_path)
        cfg = state["cfg"]
        # load task
        task = tasks.setup_task(cfg.task, from_checkpoint=True)
        task.load_state_dict(state["task_state"])
        # load model config
        if "layer_type" not in cfg.model:
            # fix a missing key
            model_config = {k: v for k, v in cfg.model.items()}
            model_config["layer_type"] = "transformer"
            model_config = OmegaConf.create(model_config)
        else:
            model_config = cfg.model

        # fix param name in the state
        state["model"]["final_proj.weight"] = state["model"].pop("final_proj.0.weight")
        state["model"]["final_proj.bias"] = state["model"].pop("final_proj.0.bias")
        del state["model"]["_ema"]

        # load model
        model = Data2VecAudioModel.build_model(model_config)
        model.load_state_dict(
            state["model"], strict=True, model_cfg=model_config
        )

        self.device = device
        logger.info(f"device = {self.device}")

        self.model = model.eval().to(self.device)
        self.task = task
        self.layer = layer - 1  # the layer argument is 1-based; convert to a 0-based index
        self.max_chunk = max_chunk
        logger.info(f"TASK CONFIG:\n{self.task.cfg}")
        logger.info(f" max_chunk = {self.max_chunk}")

    def read_audio(self, path, ref_len=None):
        wav = get_features_or_waveform(path, need_waveform=True, use_sample_rate=self.task.cfg.sample_rate)
        if wav.ndim == 2:
            wav = wav.mean(-1)
        assert wav.ndim == 1, wav.ndim
        if ref_len is not None and abs(ref_len - len(wav)) > 160:
            logger.warning(f"ref {ref_len} != read {len(wav)} ({path})")
        return wav

    def get_feats(self, path, ref_len=None):
        x = self.read_audio(path, ref_len=ref_len)
        with torch.no_grad():
            x = torch.from_numpy(x).float().to(self.device)
            if self.task.cfg.normalize:
                x = F.layer_norm(x, x.shape)
            x = x.view(1, -1)

            feat = []
            for start in range(0, x.size(1), self.max_chunk):
                x_chunk = x[:, start: start + self.max_chunk]
                res = self.model.extract_features(
                    source=x_chunk,
                    padding_mask=None,
                    mask=False,
                    layer=self.layer,
                )
                feat_chunk = res["x"]
                feat.append(feat_chunk)
        return torch.cat(feat, 1).squeeze(0)
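A rough usage sketch for the reader above (not part of the upload; the checkpoint and audio paths are placeholders):

# Placeholders: point these at a real data2vec checkpoint and a 16 kHz mono wav/flac file.
reader = Data2vecFeatureReader("path/to/data2vec_base.pt", layer=6, device="cuda", max_chunk=1600000)
feats = reader.get_feats("path/to/utterance.flac")  # (num_frames, feature_dim) torch.Tensor
print(feats.shape)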
RepCodec/examples/dump_feature.py
ADDED
@@ -0,0 +1,142 @@
# Copyright (c) ByteDance, Inc. and its affiliates.
# Copyright (c) Chutong Meng
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
# Based on fairseq (https://github.com/facebookresearch/fairseq)

import logging
import os
import sys

from feature_utils import get_path_iterator, dump_feature

logging.basicConfig(
    format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=os.environ.get("LOGLEVEL", "INFO").upper(),
    stream=sys.stdout,
)
logger = logging.getLogger("dump_feature")


def main(
    model_type: str,
    tsv_path: str,
    ckpt_path: str,
    whisper_root: str,
    whisper_name: str,
    layer: int,
    nshard: int,
    rank: int,
    feat_dir: str,
    max_chunk: int,
    use_cpu: bool = False
):
    device = "cpu" if use_cpu else "cuda"

    # some checks
    if model_type in ["hubert", "data2vec"]:
        assert ckpt_path and os.path.exists(ckpt_path)
    elif model_type in ["whisper"]:
        assert whisper_name and whisper_root
    else:
        raise ValueError(f"Unsupported model type {model_type}")

    reader = None
    if model_type == "hubert":
        from hubert_feature_reader import HubertFeatureReader
        reader = HubertFeatureReader(ckpt_path, layer, device=device, max_chunk=max_chunk)
    elif model_type == "data2vec":
        from data2vec_feature_reader import Data2vecFeatureReader
        reader = Data2vecFeatureReader(ckpt_path, layer, device=device, max_chunk=max_chunk)
    elif model_type == "whisper":
        from whisper_feature_reader import WhisperFeatureReader
        reader = WhisperFeatureReader(whisper_root, whisper_name, layer, device=device)

    assert reader is not None

    generator, num = get_path_iterator(tsv_path, nshard, rank)
    dump_feature(reader, generator, num, nshard, rank, feat_dir)


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model_type",
        required=True,
        type=str,
        choices=["data2vec", "hubert", "whisper"],
        help="the type of the speech encoder."
    )
    parser.add_argument(
        "--tsv_path",
        required=True,
        type=str,
        help="the path to the tsv file."
    )
    parser.add_argument(
        "--ckpt_path",
        required=False,
        type=str,
        default=None,
        help="path to the speech model. must provide for HuBERT and data2vec"
    )
    parser.add_argument(
        "--whisper_root",
        required=False,
        type=str,
        default=None,
        help="root dir to download/store whisper model. must provide for whisper model."
    )
    parser.add_argument(
        "--whisper_name",
        required=False,
        type=str,
        default=None,
        help="name of whisper model. e.g., large-v2. must provide for whisper model."
    )
    parser.add_argument(
        "--layer",
        required=True,
        type=int,
        help="which layer of the model. this is 1-based."
    )
    parser.add_argument(
        "--feat_dir",
        required=True,
        type=str,
        help="the output dir to save the representations."
    )
    parser.add_argument(
        "--nshard",
        required=False,
        type=int,
        default=1,
        help="total number of shards."
    )
    parser.add_argument(
        "--rank",
        required=False,
        type=int,
        default=0,
        help="shard id of this process."
    )
    parser.add_argument(
        "--max_chunk",
        type=int,
        default=1600000,
        help="max number of audio samples of each chunk."
    )
    parser.add_argument(
        "--use_cpu",
        default=False,
        action="store_true",
        help="whether to use cpu instead of gpu."
    )
    args = parser.parse_args()
    logger.info(args)

    main(**vars(args))
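The same entry point can also be driven from Python instead of the command line; a sketch with placeholder paths (layer 9 mirrors the hubert_base_l9 tokens shipped under examples/tokens):

# All paths are placeholders; the whisper_* arguments stay None because model_type is not "whisper".
main(
    model_type="hubert",
    tsv_path="path/to/dev-clean.tsv",
    ckpt_path="path/to/hubert_base_ls960.pt",
    whisper_root=None,
    whisper_name=None,
    layer=9,
    nshard=1,
    rank=0,
    feat_dir="path/to/features",
    max_chunk=1600000,
    use_cpu=False,
)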
RepCodec/examples/feature_utils.py
ADDED
@@ -0,0 +1,70 @@
# Copyright (c) ByteDance, Inc. and its affiliates.
# Copyright (c) Chutong Meng
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
# Based on fairseq (https://github.com/facebookresearch/fairseq)

# ref: https://github.com/facebookresearch/fairseq/blob/main/examples/hubert/simple_kmeans/feature_utils.py

import logging
import os
import sys

import tqdm
from npy_append_array import NpyAppendArray


logging.basicConfig(
    format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=os.environ.get("LOGLEVEL", "INFO").upper(),
    stream=sys.stdout,
)
logger = logging.getLogger("feature_utils")


def get_shard_range(tot, nshard, rank):
    assert rank < nshard and rank >= 0, f"invalid rank/nshard {rank}/{nshard}"
    start = round(tot / nshard * rank)
    end = round(tot / nshard * (rank + 1))
    assert start < end, f"start={start}, end={end}"
    logger.info(
        f"rank {rank} of {nshard}, process {end-start} "
        f"({start}-{end}) out of {tot}"
    )
    return start, end


def get_path_iterator(tsv, nshard, rank):
    with open(tsv, "r") as f:
        root = f.readline().rstrip()
        lines = [line.rstrip() for line in f]
        start, end = get_shard_range(len(lines), nshard, rank)
        lines = lines[start:end]

        def iterate():
            for line in lines:
                subpath, nsample = line.split("\t")
                yield f"{root}/{subpath}", int(nsample)

    return iterate, len(lines)


def dump_feature(reader, generator, num, nshard, rank, feat_dir):
    iterator = generator()

    feat_path = f"{feat_dir}/{rank}_{nshard}.npy"
    leng_path = f"{feat_dir}/{rank}_{nshard}.len"

    os.makedirs(feat_dir, exist_ok=True)
    if os.path.exists(feat_path):
        os.remove(feat_path)

    feat_f = NpyAppendArray(feat_path)
    with open(leng_path, "w") as leng_f:
        for path, nsample in tqdm.tqdm(iterator, total=num):
            feat = reader.get_feats(path, nsample)
            feat_f.append(feat.cpu().numpy())
            leng_f.write(f"{len(feat)}\n")
    logger.info("finished successfully")
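For reference, get_path_iterator() expects the fairseq-style manifest: the first line is the audio root and every following line is a tab-separated relative path and sample count. A tiny sketch that writes such a file (the file names and sample counts are made up):

# Hypothetical two-utterance manifest in the format parsed by get_path_iterator().
with open("dev-clean.tsv", "w") as f:
    f.write("/data/LibriSpeech/dev-clean\n")
    f.write("84/121123/84-121123-0000.flac\t154800\n")
    f.write("84/121123/84-121123-0001.flac\t208560\n")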
RepCodec/examples/hubert_feature_reader.py
ADDED
@@ -0,0 +1,64 @@
# Copyright (c) ByteDance, Inc. and its affiliates.
# Copyright (c) Chutong Meng
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
# Based on fairseq (https://github.com/facebookresearch/fairseq)

import logging

import fairseq
import torch
import torch.nn.functional as F

from fairseq.data.audio.audio_utils import get_features_or_waveform

logger = logging.getLogger("dump_feature")


class HubertFeatureReader(object):
    def __init__(self, ckpt_path: str, layer: int, device: str, max_chunk=1600000):
        (
            model,
            cfg,
            task,
        ) = fairseq.checkpoint_utils.load_model_ensemble_and_task([ckpt_path])

        self.device = device
        logger.info(f"device = {self.device}")

        self.model = model[0].eval().to(self.device)
        self.task = task
        self.layer = layer
        self.max_chunk = max_chunk
        logger.info(f"TASK CONFIG:\n{self.task.cfg}")
        logger.info(f" max_chunk = {self.max_chunk}")

    def read_audio(self, path, ref_len=None):
        wav = get_features_or_waveform(path, need_waveform=True, use_sample_rate=self.task.cfg.sample_rate)
        if wav.ndim == 2:
            wav = wav.mean(-1)
        assert wav.ndim == 1, wav.ndim
        if ref_len is not None and abs(ref_len - len(wav)) > 160:
            logger.warning(f"ref {ref_len} != read {len(wav)} ({path})")
        return wav

    def get_feats(self, path, ref_len=None):
        x = self.read_audio(path, ref_len=ref_len)
        with torch.no_grad():
            x = torch.from_numpy(x).float().to(self.device)
            if self.task.cfg.normalize:
                x = F.layer_norm(x, x.shape)
            x = x.view(1, -1)

            feat = []
            for start in range(0, x.size(1), self.max_chunk):
                x_chunk = x[:, start: start + self.max_chunk]
                feat_chunk, _ = self.model.extract_features(
                    source=x_chunk,
                    padding_mask=None,
                    mask=False,
                    output_layer=self.layer,
                )
                feat.append(feat_chunk)
        return torch.cat(feat, 1).squeeze(0)
RepCodec/examples/tokens/data2vec_base_l6_dev-clean.tokens
ADDED
The diff for this file is too large to render.
RepCodec/examples/tokens/data2vec_large_l18_dev-clean.tokens
ADDED
The diff for this file is too large to render.
RepCodec/examples/tokens/hubert_base_l9_dev-clean.tokens
ADDED
The diff for this file is too large to render.
RepCodec/examples/tokens/hubert_large_l18_dev-clean.tokens
ADDED
The diff for this file is too large to render.
RepCodec/examples/tokens/whisper_large_l32_dev-clean.tokens
ADDED
The diff for this file is too large to render.
RepCodec/examples/tokens/whisper_medium_l24_dev-clean.tokens
ADDED
The diff for this file is too large to render.
RepCodec/examples/whisper_feature_reader.py
ADDED
@@ -0,0 +1,110 @@
# Copyright (c) ByteDance, Inc. and its affiliates.
# Copyright (c) Chutong Meng
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
# Based on fairseq (https://github.com/facebookresearch/fairseq) and
# Whisper (https://github.com/openai/whisper/)

import io
import logging
import os
from typing import Optional, Union

import soundfile as sf
import torch
from whisper import _MODELS, _download, _ALIGNMENT_HEADS, available_models
from whisper.audio import log_mel_spectrogram
from whisper.model import ModelDimensions

from whisper_model import Whisper_

logger = logging.getLogger("dump_feature")


def load_model(
    name: str,
    device: Optional[Union[str, torch.device]] = None,
    download_root: str = None,
    in_memory: bool = False,
) -> Whisper_:
    """
    Reference: https://github.com/openai/whisper/blob/main/whisper/__init__.py#L97
    But we will load a `Whisper_` model for feature extraction.

    Parameters
    ----------
    name : str
        one of the official model names listed by `whisper.available_models()`, or
        path to a model checkpoint containing the model dimensions and the model state_dict.
    device : Union[str, torch.device]
        the PyTorch device to put the model into
    download_root: str
        path to download the model files; by default, it uses "~/.cache/whisper"
    in_memory: bool
        whether to preload the model weights into host memory

    Returns
    -------
    model : Whisper
        The Whisper ASR model instance
    """

    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    if download_root is None:
        default = os.path.join(os.path.expanduser("~"), ".cache")
        download_root = os.path.join(os.getenv("XDG_CACHE_HOME", default), "whisper")

    if name in _MODELS:
        checkpoint_file = _download(_MODELS[name], download_root, in_memory)
        alignment_heads = _ALIGNMENT_HEADS[name]
    elif os.path.isfile(name):
        checkpoint_file = open(name, "rb").read() if in_memory else name
        alignment_heads = None
    else:
        raise RuntimeError(
            f"Model {name} not found; available models = {available_models()}"
        )

    with (
        io.BytesIO(checkpoint_file) if in_memory else open(checkpoint_file, "rb")
    ) as fp:
        checkpoint = torch.load(fp, map_location=device)
    del checkpoint_file

    dims = ModelDimensions(**checkpoint["dims"])
    model = Whisper_(dims)
    model.load_state_dict(checkpoint["model_state_dict"])

    if alignment_heads is not None:
        model.set_alignment_heads(alignment_heads)

    return model.to(device)


class WhisperFeatureReader(object):
    def __init__(self, root, ckpt, layer, device):
        self.device = device
        logger.info(f"device = {self.device}")

        self.model: Whisper_ = load_model(name=ckpt, device=self.device, download_root=root).eval()
        self.model.decoder = None  # to save some memory by deleting the decoder
        self.layer = layer  # one-based

    def read_audio(self, path, ref_len=None):
        wav, sample_rate = sf.read(path)
        assert sample_rate == 16000, sample_rate
        if ref_len is not None and abs(ref_len - len(wav)) > 160:
            logger.warning(f"ref {ref_len} != read {len(wav)} ({path})")
        return wav

    def get_feats(self, path, ref_len=None):
        wav = self.read_audio(path, ref_len)
        audio_length = len(wav)
        with torch.no_grad():
            mel = log_mel_spectrogram(torch.from_numpy(wav).float().to(self.device))
            hidden = self.model.extract_features(mel.unsqueeze(0), target_layer=self.layer)
            feature_length = audio_length // 320
            hidden = hidden[0, :feature_length]
        return hidden.contiguous()
RepCodec/examples/whisper_model.py
ADDED
@@ -0,0 +1,58 @@
# Copyright (c) ByteDance, Inc. and its affiliates.
# Copyright (c) Chutong Meng
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
# Based on fairseq (https://github.com/facebookresearch/fairseq) and
# Whisper (https://github.com/openai/whisper/)

from typing import Optional

import torch
import torch.nn.functional as F
from torch import Tensor
from whisper.model import AudioEncoder, sinusoids, Whisper, ModelDimensions


class AudioEncoder_(AudioEncoder):
    def __init__(self, *args, **kwargs):
        super(AudioEncoder_, self).__init__(*args, **kwargs)

    def extract_feature(self, x: Tensor, target_layer: Optional[int] = None):
        """
        x : torch.Tensor, shape = (batch_size, n_mels, n_ctx)
            the mel spectrogram of the audio
        """
        x = F.gelu(self.conv1(x))
        x = F.gelu(self.conv2(x))
        x = x.permute(0, 2, 1)

        length_x = x.shape[1]
        if length_x > self.positional_embedding.shape[0]:
            self.register_buffer("positional_embedding", sinusoids(length_x, self.positional_embedding.shape[1]))
            self.positional_embedding = self.positional_embedding.to(x.device)
        x = (x + self.positional_embedding[:length_x, :]).to(x.dtype)

        if target_layer is None:
            target_layer = len(self.blocks)

        for block in self.blocks[:target_layer]:
            x = block(x)

        return x


class Whisper_(Whisper):
    def __init__(self, dims: ModelDimensions):
        super(Whisper_, self).__init__(dims)
        # replace audio encoder with our audio encoder
        self.encoder = AudioEncoder_(
            self.dims.n_mels,
            self.dims.n_audio_ctx,
            self.dims.n_audio_state,
            self.dims.n_audio_head,
            self.dims.n_audio_layer,
        )

    def extract_features(self, mel: torch.Tensor, target_layer: Optional[int] = None):
        return self.encoder.extract_feature(mel, target_layer)
RepCodec/repcodec/RepCodec.py
ADDED
@@ -0,0 +1,84 @@
# Copyright (c) ByteDance, Inc. and its affiliates.
# Copyright (c) Chutong Meng
#
# This source code is licensed under the CC BY-NC license found in the
# LICENSE file in the root directory of this source tree.
# Based on AudioDec (https://github.com/facebookresearch/AudioDec)

import torch.nn as nn

from repcodec.modules.decoder import Decoder
from repcodec.modules.encoder import Encoder
from repcodec.modules.projector import Projector
from repcodec.modules.quantizer import Quantizer


class RepCodec(nn.Module):
    def __init__(
        self,
        input_channels=768,
        output_channels=768,
        encode_channels=768,
        decode_channels=768,
        code_dim=768,
        codebook_num=1,
        codebook_size=1024,
        bias=True,
        enc_ratios=(1, 1),
        dec_ratios=(1, 1),
        enc_strides=(1, 1),
        dec_strides=(1, 1),
        enc_kernel_size=3,
        dec_kernel_size=3,
        enc_block_dilations=(1, 1),
        enc_block_kernel_size=3,
        dec_block_dilations=(1, 1),
        dec_block_kernel_size=3
    ):
        super().__init__()

        self.input_channels = input_channels

        self.encoder = Encoder(
            input_channels=input_channels,
            encode_channels=encode_channels,
            channel_ratios=enc_ratios,
            strides=enc_strides,
            kernel_size=enc_kernel_size,
            bias=bias,
            block_dilations=enc_block_dilations,
            unit_kernel_size=enc_block_kernel_size
        )

        self.decoder = Decoder(
            code_dim=code_dim,
            output_channels=output_channels,
            decode_channels=decode_channels,
            channel_ratios=dec_ratios,
            strides=dec_strides,
            kernel_size=dec_kernel_size,
            bias=bias,
            block_dilations=dec_block_dilations,
            unit_kernel_size=dec_block_kernel_size
        )

        self.projector = Projector(
            input_channels=self.encoder.out_channels,
            code_dim=code_dim,
            kernel_size=3,
            stride=1,
            bias=False
        )

        self.quantizer = Quantizer(
            code_dim=code_dim,
            codebook_num=codebook_num,
            codebook_size=codebook_size
        )

    def forward(self, x):
        x = self.encoder(x)
        z = self.projector(x)
        zq, vqloss, perplexity = self.quantizer(z)
        y = self.decoder(zq)
        return y, zq, z, vqloss, perplexity
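A quick shape check of the model above, using random features in place of real speech representations (the defaults match the 768-dimensional configuration below):

import torch

model = RepCodec()                      # 768-dim defaults, single codebook of 1024 codes
x = torch.randn(2, 768, 100)            # (batch, input_channels, frames)
y, zq, z, vqloss, perplexity = model(x)
print(y.shape, zq.shape)                # both (2, 768, 100): unit strides mean no downsampling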
RepCodec/repcodec/configs/repcodec_dim1024.yaml
ADDED
@@ -0,0 +1,18 @@
input_channels: 1024
output_channels: 1024
encode_channels: 1024
decode_channels: 1024
code_dim: 1024
codebook_num: 1
codebook_size: 1024
bias: true
enc_ratios: [ 1, 1 ]
dec_ratios: [ 1, 1 ]
enc_strides: [ 1, 1 ] # no downsampling
dec_strides: [ 1, 1 ]
enc_kernel_size: 3
dec_kernel_size: 3
enc_block_dilations: [ 1, 1 ]
enc_block_kernel_size: 3
dec_block_dilations: [ 1, 1 ]
dec_block_kernel_size: 3
RepCodec/repcodec/configs/repcodec_dim1280.yaml
ADDED
@@ -0,0 +1,18 @@
input_channels: 1280
output_channels: 1280
encode_channels: 1280
decode_channels: 1280
code_dim: 1280
codebook_num: 1
codebook_size: 1024
bias: true
enc_ratios: [ 1, 1 ]
dec_ratios: [ 1, 1 ]
enc_strides: [ 1, 1 ] # no downsampling
dec_strides: [ 1, 1 ]
enc_kernel_size: 3
dec_kernel_size: 3
enc_block_dilations: [ 1, 1 ]
enc_block_kernel_size: 3
dec_block_dilations: [ 1, 1 ]
dec_block_kernel_size: 3
RepCodec/repcodec/configs/repcodec_dim768.yaml
ADDED
@@ -0,0 +1,18 @@
input_channels: 768
output_channels: 768
encode_channels: 768
decode_channels: 768
code_dim: 768
codebook_num: 1
codebook_size: 1024
bias: true
enc_ratios: [ 1, 1 ]
dec_ratios: [ 1, 1 ]
enc_strides: [ 1, 1 ] # no downsampling
dec_strides: [ 1, 1 ]
enc_kernel_size: 3
dec_kernel_size: 3
enc_block_dilations: [ 1, 1 ]
enc_block_kernel_size: 3
dec_block_dilations: [ 1, 1 ]
dec_block_kernel_size: 3
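These configs map one-to-one onto the RepCodec constructor arguments, so a model can be built straight from a file. A sketch, assuming the repcodec package from this upload is importable and the path below is adjusted to your checkout:

import yaml
from repcodec.RepCodec import RepCodec

with open("RepCodec/repcodec/configs/repcodec_dim768.yaml") as f:
    conf = yaml.safe_load(f)
model = RepCodec(**conf).eval()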
RepCodec/repcodec/layers/conv_layer.py
ADDED
@@ -0,0 +1,95 @@
# Copyright (c) ByteDance, Inc. and its affiliates.
# Copyright (c) Chutong Meng
#
# This source code is licensed under the CC BY-NC license found in the
# LICENSE file in the root directory of this source tree.
# Based on AudioDec (https://github.com/facebookresearch/AudioDec)

import torch.nn as nn


class Conv1d1x1(nn.Conv1d):
    """1x1 Conv1d."""

    def __init__(self, in_channels, out_channels, bias=True):
        super(Conv1d1x1, self).__init__(in_channels, out_channels, kernel_size=1, bias=bias)


class Conv1d(nn.Module):
    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int,
        stride: int = 1,
        padding: int = -1,
        dilation: int = 1,
        groups: int = 1,
        bias: bool = True
    ):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        if padding < 0:
            padding = (kernel_size - 1) // 2 * dilation
        self.dilation = dilation
        self.conv = nn.Conv1d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            groups=groups,
            bias=bias,
        )

    def forward(self, x):
        """
        Args:
            x (Tensor): Float tensor variable with the shape (B, C, T).
        Returns:
            Tensor: Float tensor variable with the shape (B, C, T).
        """
        x = self.conv(x)
        return x


class ConvTranspose1d(nn.Module):
    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int,
        stride: int,
        padding=-1,
        output_padding=-1,
        groups=1,
        bias=True,
    ):
        super().__init__()
        if padding < 0:
            padding = (stride + 1) // 2
        if output_padding < 0:
            output_padding = 1 if stride % 2 else 0
        self.deconv = nn.ConvTranspose1d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            output_padding=output_padding,
            groups=groups,
            bias=bias,
        )

    def forward(self, x):
        """
        Args:
            x (Tensor): Float tensor variable with the shape (B, C, T).
        Returns:
            Tensor: Float tensor variable with the shape (B, C', T').
        """
        x = self.deconv(x)
        return x
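One property worth noting: with the default padding of (kernel_size - 1) // 2 * dilation, a stride-1 Conv1d leaves the time dimension unchanged. A small check:

import torch

conv = Conv1d(in_channels=4, out_channels=4, kernel_size=3)   # padding defaults to 1
out = conv(torch.randn(1, 4, 100))
print(out.shape)                                              # torch.Size([1, 4, 100])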
RepCodec/repcodec/layers/vq_module.py
ADDED
@@ -0,0 +1,155 @@
# Copyright (c) ByteDance, Inc. and its affiliates.
# Copyright (c) Chutong Meng
#
# This source code is licensed under the CC BY-NC license found in the
# LICENSE file in the root directory of this source tree.
# Based on AudioDec (https://github.com/facebookresearch/AudioDec)

import torch
import torch.nn as nn
import torch.nn.functional as F


class VectorQuantize(nn.Module):
    """Vector quantization w/ exponential moving averages (EMA)"""

    def __init__(
        self,
        dim: int,
        codebook_size: int,
        decay=0.8,
        commitment=1.,
        eps=1e-5,
        n_embed=None,
    ):
        super().__init__()
        n_embed = self.default(n_embed, codebook_size)

        self.dim = dim
        self.n_embed = n_embed
        self.decay = decay
        self.eps = eps
        self.commitment = commitment

        embed = torch.randn(dim, n_embed)
        self.register_buffer('embed', embed)
        self.register_buffer('cluster_size', torch.zeros(n_embed))
        self.register_buffer('embed_avg', embed.clone())

    @property
    def codebook(self):
        return self.embed.transpose(0, 1)

    def exists(self, val):
        return val is not None

    def default(self, val, d):
        return val if self.exists(val) else d

    def ema_inplace(self, moving_avg, new, decay):
        moving_avg.data.mul_(decay).add_(new, alpha=(1 - decay))

    def laplace_smoothing(self, x, n_categories, eps=1e-5):
        return (x + eps) / (x.sum() + n_categories * eps)

    def forward(self, input):
        dtype = input.dtype
        flatten = input.reshape(-1, self.dim)
        dist = (
            flatten.pow(2).sum(1, keepdim=True)
            - 2 * flatten @ self.embed
            + self.embed.pow(2).sum(0, keepdim=True)
        )
        _, embed_ind = (-dist).max(1)
        embed_onehot = F.one_hot(embed_ind, self.n_embed).type(dtype)
        embed_ind = embed_ind.view(*input.shape[:-1])
        quantize = F.embedding(embed_ind, self.embed.transpose(0, 1))

        if self.training:
            self.ema_inplace(self.cluster_size, embed_onehot.sum(0), self.decay)
            embed_sum = flatten.transpose(0, 1) @ embed_onehot
            self.ema_inplace(self.embed_avg, embed_sum, self.decay)
            cluster_size = self.laplace_smoothing(self.cluster_size, self.n_embed, self.eps) * self.cluster_size.sum()
            embed_normalized = self.embed_avg / cluster_size.unsqueeze(0)
            self.embed.data.copy_(embed_normalized)

        loss = F.mse_loss(quantize.detach(), input) * self.commitment
        quantize = input + (quantize - input).detach()

        avg_probs = torch.mean(embed_onehot, dim=0)
        perplexity = torch.exp(-torch.sum(avg_probs * torch.log(avg_probs + 1e-10)))

        return quantize, loss, perplexity

    def forward_index(self, input):
        dtype = input.dtype
        flatten = input.reshape(-1, self.dim)
        dist = (
            flatten.pow(2).sum(1, keepdim=True)
            - 2 * flatten @ self.embed
            + self.embed.pow(2).sum(0, keepdim=True)
        )
        _, embed_ind = (-dist).max(1)
        embed_onehot = F.one_hot(embed_ind, self.n_embed).type(dtype)
        embed_ind = embed_ind.view(*input.shape[:-1])
        quantize = F.embedding(embed_ind, self.embed.transpose(0, 1))
        quantize = input + (quantize - input).detach()

        return quantize, embed_ind


class ResidualVQ(nn.Module):
    """ Residual VQ following algorithm 1. in https://arxiv.org/pdf/2107.03312.pdf """

    def __init__(
        self,
        *,
        num_quantizers,
        **kwargs
    ):
        super().__init__()
        self.layers = nn.ModuleList([VectorQuantize(**kwargs) for _ in range(num_quantizers)])

    def forward(self, x):
        quantized_out = 0.
        residual = x
        all_losses = []
        all_perplexities = []
        for layer in self.layers:
            quantized, loss, perplexity = layer(residual)
            # Issue: https://github.com/lucidrains/vector-quantize-pytorch/issues/33
            # We found that considering only the 1st layer VQ's gradient results in better performance
            # residual = residual - quantized.detach()  # considering all layers' gradients
            residual = residual - quantized  # considering only the first layer's gradient
            quantized_out = quantized_out + quantized
            all_losses.append(loss)
            all_perplexities.append(perplexity)
        all_losses, all_perplexities = map(torch.stack, (all_losses, all_perplexities))
        return quantized_out, all_losses, all_perplexities

    def forward_index(self, x, flatten_idx=False):
        quantized_out = 0.
        residual = x
        all_indices = []
        for i, layer in enumerate(self.layers):
            quantized, indices = layer.forward_index(residual)
            # residual = residual - quantized.detach()
            residual = residual - quantized
            quantized_out = quantized_out + quantized
            if flatten_idx:
                indices += (self.codebook_size * i)
            all_indices.append(indices)
        all_indices = torch.stack(all_indices)
        return quantized_out, all_indices.squeeze(1)

    def initial(self):
        self.codebook = []
        for layer in self.layers:
            self.codebook.append(layer.codebook)
        self.codebook_size = self.codebook[0].size(0)
        self.codebook = torch.stack(self.codebook)
        self.codebook = self.codebook.reshape(-1, self.codebook.size(-1))

    def lookup(self, indices):
        quantized_out = F.embedding(indices, self.codebook)  # Num x T x C
        return torch.sum(quantized_out, dim=0, keepdim=True)
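A toy check of the EMA quantizer above, showing the expected shapes and the straight-through output (the sizes are arbitrary):

import torch

vq = VectorQuantize(dim=8, codebook_size=16)
x = torch.randn(4, 10, 8)                                 # (..., dim); here (batch, time, dim)
quantized, loss, perplexity = vq(x)
print(quantized.shape, loss.item(), perplexity.item())    # (4, 10, 8), commitment loss, codebook usage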
RepCodec/repcodec/modules/decoder.py
ADDED
@@ -0,0 +1,109 @@
# Copyright (c) ByteDance, Inc. and its affiliates.
# Copyright (c) Chutong Meng
#
# This source code is licensed under the CC BY-NC license found in the
# LICENSE file in the root directory of this source tree.
# Based on AudioDec (https://github.com/facebookresearch/AudioDec)

import torch
import torch.nn as nn

from RepCodec.repcodec.layers.conv_layer import Conv1d, ConvTranspose1d
from RepCodec.repcodec.modules.residual_unit import ResidualUnit


class DecoderBlock(nn.Module):
    """ Decoder block (no up-sampling) """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        stride: int,
        dilations=(1, 1),
        unit_kernel_size=3,
        bias=True
    ):
        super().__init__()

        if stride == 1:
            self.conv = Conv1d(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=3,  # fix kernel=3 when stride=1 for unchanged shape
                stride=stride,
                bias=bias,
            )
        else:
            self.conv = ConvTranspose1d(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=(2 * stride),
                stride=stride,
                bias=bias,
            )

        self.res_units = torch.nn.ModuleList()
        for idx, dilation in enumerate(dilations):
            self.res_units += [
                ResidualUnit(out_channels, out_channels,
                             kernel_size=unit_kernel_size,
                             dilation=dilation)
            ]
        self.num_res = len(self.res_units)

    def forward(self, x):
        x = self.conv(x)
        for idx in range(self.num_res):
            x = self.res_units[idx](x)
        return x


class Decoder(nn.Module):
    def __init__(
        self,
        code_dim: int,
        output_channels: int,
        decode_channels: int,
        channel_ratios=(1, 1),
        strides=(1, 1),
        kernel_size=3,
        bias=True,
        block_dilations=(1, 1),
        unit_kernel_size=3,
    ):
        super().__init__()
        assert len(channel_ratios) == len(strides)

        self.conv1 = Conv1d(
            in_channels=code_dim,
            out_channels=int(decode_channels * channel_ratios[0]),
            kernel_size=kernel_size,
            stride=1,
            bias=False
        )

        self.conv_blocks = torch.nn.ModuleList()
        for idx, stride in enumerate(strides):
            in_channels = int(decode_channels * channel_ratios[idx])
            if idx < (len(channel_ratios) - 1):
                out_channels = int(decode_channels * channel_ratios[idx + 1])
            else:
                out_channels = decode_channels
            self.conv_blocks += [
                DecoderBlock(
                    in_channels, out_channels, stride,
                    dilations=block_dilations, unit_kernel_size=unit_kernel_size,
                    bias=bias
                )
            ]
        self.num_blocks = len(self.conv_blocks)

        self.conv2 = Conv1d(out_channels, output_channels, kernel_size, 1, bias=False)

    def forward(self, z):
        x = self.conv1(z)
        for i in range(self.num_blocks):
            x = self.conv_blocks[i](x)
        x = self.conv2(x)
        return x
RepCodec/repcodec/modules/encoder.py
ADDED
@@ -0,0 +1,89 @@
# Copyright (c) ByteDance, Inc. and its affiliates.
# Copyright (c) Chutong Meng
#
# This source code is licensed under the CC BY-NC license found in the
# LICENSE file in the root directory of this source tree.
# Based on AudioDec (https://github.com/facebookresearch/AudioDec)

import torch
import torch.nn as nn

from RepCodec.repcodec.layers.conv_layer import Conv1d
from RepCodec.repcodec.modules.residual_unit import ResidualUnit


class EncoderBlock(nn.Module):
    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        stride: int,
        dilations=(1, 1),
        unit_kernel_size=3,
        bias=True
    ):
        super().__init__()
        self.res_units = torch.nn.ModuleList()
        for dilation in dilations:
            self.res_units += [
                ResidualUnit(in_channels, in_channels,
                             kernel_size=unit_kernel_size,
                             dilation=dilation)
            ]
        self.num_res = len(self.res_units)

        self.conv = Conv1d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=3 if stride == 1 else (2 * stride),  # special case: stride=1, do not use kernel=2
            stride=stride,
            bias=bias,
        )

    def forward(self, x):
        for idx in range(self.num_res):
            x = self.res_units[idx](x)
        x = self.conv(x)
        return x


class Encoder(nn.Module):
    def __init__(
        self,
        input_channels: int,
        encode_channels: int,
        channel_ratios=(1, 1),
        strides=(1, 1),
        kernel_size=3,
        bias=True,
        block_dilations=(1, 1),
        unit_kernel_size=3
    ):
        super().__init__()
        assert len(channel_ratios) == len(strides)

        self.conv = Conv1d(
            in_channels=input_channels,
            out_channels=encode_channels,
            kernel_size=kernel_size,
            stride=1,
            bias=False
        )
        self.conv_blocks = torch.nn.ModuleList()
        in_channels = encode_channels
        for idx, stride in enumerate(strides):
            out_channels = int(encode_channels * channel_ratios[idx])  # could be float
            self.conv_blocks += [
                EncoderBlock(in_channels, out_channels, stride,
                             dilations=block_dilations, unit_kernel_size=unit_kernel_size,
                             bias=bias)
            ]
            in_channels = out_channels
        self.num_blocks = len(self.conv_blocks)
        self.out_channels = out_channels

    def forward(self, x):
        x = self.conv(x)
        for i in range(self.num_blocks):
            x = self.conv_blocks[i](x)
        return x
RepCodec/repcodec/modules/projector.py
ADDED
@@ -0,0 +1,32 @@
# Copyright (c) ByteDance, Inc. and its affiliates.
# Copyright (c) Chutong Meng
#
# This source code is licensed under the CC BY-NC license found in the
# LICENSE file in the root directory of this source tree.
# Based on AudioDec (https://github.com/facebookresearch/AudioDec)

import torch.nn as nn

from repcodec.layers.conv_layer import Conv1d


class Projector(nn.Module):
    def __init__(
        self,
        input_channels: int,
        code_dim: int,
        kernel_size=3,
        stride=1,
        bias=False
    ):
        super().__init__()
        self.project = Conv1d(
            input_channels,
            code_dim,
            kernel_size=kernel_size,
            stride=stride,
            bias=bias
        )

    def forward(self, x):
        return self.project(x)
RepCodec/repcodec/modules/quantizer.py
ADDED
@@ -0,0 +1,46 @@
# Copyright (c) ByteDance, Inc. and its affiliates.
# Copyright (c) Chutong Meng
#
# This source code is licensed under the CC BY-NC license found in the
# LICENSE file in the root directory of this source tree.
# Based on AudioDec (https://github.com/facebookresearch/AudioDec)

import torch.nn as nn

from repcodec.layers.vq_module import ResidualVQ


class Quantizer(nn.Module):
    def __init__(
            self,
            code_dim: int,
            codebook_num: int,
            codebook_size: int,
    ):
        super().__init__()
        self.codebook = ResidualVQ(
            dim=code_dim,
            num_quantizers=codebook_num,
            codebook_size=codebook_size
        )

    def initial(self):
        self.codebook.initial()

    def forward(self, z):
        zq, vqloss, perplexity = self.codebook(z.transpose(2, 1))
        zq = zq.transpose(2, 1)
        return zq, vqloss, perplexity

    def inference(self, z):
        zq, indices = self.codebook.forward_index(z.transpose(2, 1))
        zq = zq.transpose(2, 1)
        return zq, indices

    def encode(self, z):
        zq, indices = self.codebook.forward_index(z.transpose(2, 1), flatten_idx=True)
        return zq, indices

    def decode(self, indices):
        z = self.codebook.lookup(indices)
        return z
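A minimal usage sketch for the Quantizer wrapper (illustrative only; the shapes follow the transposes above and the index handling in repcodec/tokenize.py further down):

import torch
from repcodec.modules.quantizer import Quantizer

quantizer = Quantizer(code_dim=768, codebook_num=1, codebook_size=1024)
quantizer.initial()                    # tokenize.load_model calls this once after loading weights

z = torch.randn(2, 768, 50)            # (batch, code dim, frames)
zq, indices = quantizer.inference(z)   # zq: (batch, code dim, frames); indices: one row per codebook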
RepCodec/repcodec/modules/residual_unit.py
ADDED
@@ -0,0 +1,39 @@
# Copyright (c) ByteDance, Inc. and its affiliates.
# Copyright (c) Chutong Meng
#
# This source code is licensed under the CC BY-NC license found in the
# LICENSE file in the root directory of this source tree.
# Based on AudioDec (https://github.com/facebookresearch/AudioDec)

import torch.nn as nn

from RepCodec.repcodec.layers.conv_layer import Conv1d, Conv1d1x1


class ResidualUnit(nn.Module):
    def __init__(
            self,
            in_channels: int,
            out_channels: int,
            kernel_size=3,
            dilation=1,
            bias=False,
            nonlinear_activation="ELU",
            nonlinear_activation_params={},
    ):
        super().__init__()
        self.activation = getattr(nn, nonlinear_activation)(**nonlinear_activation_params)
        self.conv1 = Conv1d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=1,
            dilation=dilation,
            bias=bias,
        )
        self.conv2 = Conv1d1x1(out_channels, out_channels, bias)

    def forward(self, x):
        y = self.conv1(self.activation(x))
        y = self.conv2(self.activation(y))
        return x + y
RepCodec/repcodec/tokenize.py
ADDED
@@ -0,0 +1,212 @@
# Copyright (c) ByteDance, Inc. and its affiliates.
# Copyright (c) Chutong Meng
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import argparse
import os
from pathlib import Path
from typing import Tuple, List, Optional

import numpy as np
import torch
import yaml
from tqdm import tqdm

from repcodec.RepCodec import RepCodec

ALL_MODELS = {
    "data2vec_base_l6": 768,
    "data2vec_large_l18": 1024,
    "hubert_base_l9": 768,
    "hubert_large_l18": 1024,
    "whisper_medium_l24": 1024,
    "whisper_large_l32": 1280
}


def parse_args():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        "in_dir",
        type=str,
        help="directory of representations to be tokenized."
    )
    parser.add_argument(
        "--model",
        required=True,
        type=str,
        help="path of the RepCodec model."
    )
    parser.add_argument(
        "--tsv_path",
        required=True,
        type=str,
        help="path of the tsv file."
    )
    parser.add_argument(
        "--model_config_path",
        default=None,
        type=str,
        help="please provide this training config if you are using a model you trained yourself."
    )
    parser.add_argument(
        "--n_shard",
        required=False,
        type=int,
        default=1,
        help="number of shards of representations."
    )
    parser.add_argument(
        "--use_gpu",
        default=False,
        action="store_true",
        help="whether to use gpu for inference."
    )
    parser.add_argument(
        "--batch_size",
        default=1,
        type=int,
        help="number of utterances in each mini batch."
    )
    parser.add_argument(
        "--out_dir",
        type=str,
        default=".",
        help="the directory to save the output."
    )
    return parser.parse_args()


def load_model(model_path: str, config_path: Optional[str] = None):
    if config_path is None:
        name = os.path.basename(model_path).strip(".pkl")
        assert name in ALL_MODELS.keys(), f"Cannot find configs for {model_path}. " \
                                          f"Please provide the config file you used for training."
        config = os.path.join(os.path.dirname(__file__), "configs", f"repcodec_dim{ALL_MODELS[name]}.yaml")
        with open(config) as fp:
            conf = yaml.load(fp, Loader=yaml.FullLoader)
    else:
        with open(config_path) as fp:
            conf = yaml.load(fp, Loader=yaml.FullLoader)["model_params"]

    model = RepCodec(**conf)
    model.load_state_dict(torch.load(model_path, map_location="cpu")["model"]["repcodec"])
    model.quantizer.initial()
    model.eval()
    return model


def load_shard(in_dir: Path, rank: int, n_shard: int) -> Tuple[np.ndarray, List[int]]:
    feat_path = in_dir / f"{rank}_{n_shard}.npy"
    len_path = in_dir / f"{rank}_{n_shard}.len"

    with open(len_path) as fp:
        lengths = [int(line.strip()) for line in fp]

    return np.load(feat_path.as_posix(), mmap_mode="r"), lengths


def pad_data(data: List[np.ndarray]) -> List[np.ndarray]:
    max_len = max([d.shape[0] for d in data])
    data = [
        np.pad(d, [(0, max_len - d.shape[0]), (0, 0)], "constant", constant_values=0.0)
        for d in data
    ]
    return data


def make_batch_data(data: np.ndarray, shard_lengths: List[int], batch_size: int):
    batch_data = []
    batch_lens = []
    offsets = np.cumsum([0] + shard_lengths)
    assert len(data) == offsets[-1], f"{len(data)} {offsets[-1]}"

    # from longest to shortest
    for i in range(len(shard_lengths)):
        if batch_size > len(batch_data):
            batch_data.append(data[offsets[i]: offsets[i + 1]])
            batch_lens.append(shard_lengths[i])
        else:
            yield {
                "data": torch.tensor(np.stack(pad_data(batch_data)), dtype=torch.float),  # (bsz, seq len, hidden dim)
                "lengths": batch_lens
            }
            batch_data = [data[offsets[i]: offsets[i + 1]]]
            batch_lens = [shard_lengths[i]]
    if len(batch_data) > 0:
        yield {
            "data": torch.tensor(np.stack(pad_data(batch_data)), dtype=torch.float),
            "lengths": batch_lens
        }


def tokenize_batch(model: RepCodec, batch: dict, device: str) -> List[List[int]]:
    with torch.no_grad():
        data = batch["data"].transpose(1, 2).to(device)  # (bsz, hidden dim, seq len)
        x = model.encoder(data)
        z = model.projector(x)
        _, idx = model.quantizer.codebook.forward_index(z.transpose(2, 1))

    # when bsz=1: (1, seq len)
    if idx.dim() == 2:
        return idx.cpu().data.numpy().tolist()
    # when bsz>1: (1, bsz, seq len)
    tokens = idx.cpu().data.numpy().tolist()[0]
    res = []
    batch_lens = batch["lengths"]
    for i in range(len(tokens)):
        n_tokens = batch_lens[i]
        res.append(tokens[i][:n_tokens])
    return res


def load_tsv(path: str):
    with open(path) as fp:
        root = fp.readline().strip()
        names = []
        for line in fp:
            names.append(line.strip().split("\t")[0])
    return root, names


def cli():
    args = parse_args()
    device = "cuda" if args.use_gpu else "cpu"

    model = load_model(model_path=args.model, config_path=args.model_config_path)
    model.to(device)

    in_dir = Path(args.in_dir)
    n_shard = args.n_shard
    batch_size = args.batch_size

    root_dir, file_names = load_tsv(args.tsv_path)

    output_dir = args.out_dir
    os.makedirs(output_dir, exist_ok=True)

    processed_cnt = 0
    pbar = tqdm(total=len(file_names))
    with open(os.path.join(output_dir, "tokens"), mode="w+") as fp:
        fp.write(f"{root_dir}\n")

        for rank in range(n_shard):
            shard_data, shard_lengths = load_shard(in_dir, rank, n_shard)
            for batch in make_batch_data(shard_data, shard_lengths, batch_size=batch_size):
                batch_tokens = tokenize_batch(model, batch, device)

                for tokens in batch_tokens:
                    fp.write(f"{file_names[processed_cnt]}\t{' '.join(map(str, tokens))}\n")
                    processed_cnt += 1

                pbar.update(len(batch_tokens))
    assert processed_cnt == len(file_names), "# lines of tsv do not match # of representations!"

    pbar.close()
    print("Tokenize successfully!")


if __name__ == '__main__':
    cli()
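For reference, tokenize.py expects representation shards named {rank}_{n_shard}.npy with matching .len files in in_dir, plus a fairseq-style tsv whose first line is the audio root. A hypothetical invocation (all paths are placeholders) would look like:

python RepCodec/repcodec/tokenize.py /path/to/reprs \
    --model /path/to/hubert_base_l9.pkl \
    --tsv_path /path/to/dev-clean.tsv \
    --n_shard 1 --batch_size 16 --use_gpu --out_dir tokens_out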
RepCodec/setup.py
ADDED
@@ -0,0 +1,31 @@
# Copyright (c) ByteDance, Inc. and its affiliates.
# Copyright (c) Chutong Meng
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

from setuptools import setup

try:
    with open("README.md") as fp:
        long_description = fp.read()
except Exception:
    long_description = ""

setup(
    name="RepCodec",
    version="v1.0.0",
    description="A Speech Representation Codec for Speech Tokenization",
    long_description=long_description,
    url="https://github.com/mct10/RepCodec",
    packages=["repcodec", "repcodec.modules", "repcodec.layers"],
    package_data={
        "repcodec": ["configs/*.yaml"]
    },
    install_requires=["numpy", "tqdm", "torch", "tensorboardX", "PyYAML"],
    entry_points={
        'console_scripts': [
            "repcodec=repcodec.tokenize:cli"
        ]
    }
)
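Installing this package (for example with pip install ./RepCodec; path assumed) registers a repcodec console command mapped to repcodec.tokenize:cli, so the tokenizer above can also be run as repcodec <in_dir> --model ... instead of calling the script directly.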
RepCodec/train.py
ADDED
@@ -0,0 +1,228 @@
# Copyright (c) ByteDance, Inc. and its affiliates.
# Copyright (c) Chutong Meng
#
# This source code is licensed under the CC BY-NC license found in the
# LICENSE file in the root directory of this source tree.
# Based on AudioDec (https://github.com/facebookresearch/AudioDec)

import argparse
import logging

import os

logging.basicConfig(
    format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=os.environ.get("LOGLEVEL", "INFO").upper(),
)
logger = logging.getLogger("repcodec_train")  # init logger before other modules

import random

import numpy as np
import torch
import yaml
from torch.utils.data import DataLoader

from dataloader import ReprDataset, ReprCollater
from losses.repr_reconstruct_loss import ReprReconstructLoss
from repcodec.RepCodec import RepCodec
from trainer.autoencoder import Trainer


class TrainMain:
    def __init__(self, args):
        # Fix seed and make backends deterministic
        random.seed(args.seed)
        np.random.seed(args.seed)
        torch.manual_seed(args.seed)
        if not torch.cuda.is_available():
            self.device = torch.device('cpu')
            logger.info(f"device: cpu")
        else:
            self.device = torch.device('cuda:0')  # only supports single gpu for now
            logger.info(f"device: gpu")
            torch.cuda.manual_seed_all(args.seed)
            if args.disable_cudnn == "False":
                torch.backends.cudnn.benchmark = True

        # initialize config
        with open(args.config, 'r') as f:
            self.config = yaml.load(f, Loader=yaml.FullLoader)
        self.config.update(vars(args))

        # initialize model folder
        expdir = os.path.join(args.exp_root, args.tag)
        os.makedirs(expdir, exist_ok=True)
        self.config["outdir"] = expdir

        # save config
        with open(os.path.join(expdir, "config.yml"), "w") as f:
            yaml.dump(self.config, f, Dumper=yaml.Dumper)
        for key, value in self.config.items():
            logger.info(f"{key} = {value}")

        # initialize attribute
        self.resume: str = args.resume
        self.data_loader = None
        self.model = None
        self.optimizer = None
        self.scheduler = None
        self.criterion = None
        self.trainer = None

        # initialize batch_length
        self.batch_length: int = self.config['batch_length']
        self.data_path: str = self.config['data']['path']

    def initialize_data_loader(self):
        train_set = self._build_dataset("train")
        valid_set = self._build_dataset("valid")
        collater = ReprCollater()

        logger.info(f"The number of training files = {len(train_set)}.")
        logger.info(f"The number of validation files = {len(valid_set)}.")
        dataset = {"train": train_set, "dev": valid_set}
        self._set_data_loader(dataset, collater)

    def define_model_optimizer_scheduler(self):
        # model arch
        self.model = {
            "repcodec": RepCodec(**self.config["model_params"]).to(self.device)
        }
        logger.info(f"Model Arch:\n{self.model['repcodec']}")

        # opt
        optimizer_class = getattr(
            torch.optim,
            self.config["model_optimizer_type"]
        )
        self.optimizer = {
            "repcodec": optimizer_class(
                self.model["repcodec"].parameters(),
                **self.config["model_optimizer_params"]
            )
        }

        # scheduler
        scheduler_class = getattr(
            torch.optim.lr_scheduler,
            self.config.get("model_scheduler_type", "StepLR"),
        )
        self.scheduler = {
            "repcodec": scheduler_class(
                optimizer=self.optimizer["repcodec"],
                **self.config["model_scheduler_params"]
            )
        }

    def define_criterion(self):
        self.criterion = {
            "repr_reconstruct_loss": ReprReconstructLoss(
                **self.config.get("repr_reconstruct_loss_params", {}),
            ).to(self.device)
        }

    def define_trainer(self):
        self.trainer = Trainer(
            steps=0,
            epochs=0,
            data_loader=self.data_loader,
            model=self.model,
            criterion=self.criterion,
            optimizer=self.optimizer,
            scheduler=self.scheduler,
            config=self.config,
            device=self.device
        )

    def initialize_model(self):
        initial = self.config.get("initial", "")
        if os.path.exists(self.resume):  # resume from trained model
            self.trainer.load_checkpoint(self.resume)
            logger.info(f"Successfully resumed from {self.resume}.")
        elif os.path.exists(initial):  # initialize new model with the pre-trained model
            self.trainer.load_checkpoint(initial, load_only_params=True)
            logger.info(f"Successfully initialized parameters from {initial}.")
        else:
            logger.info("Train from scratch")

    def run(self):
        assert self.trainer is not None
        self.trainer: Trainer
        try:
            logger.info(f"The current training step: {self.trainer.steps}")
            self.trainer.train_max_steps = self.config["train_max_steps"]
            if not self.trainer._check_train_finish():
                self.trainer.run()
        finally:
            self.trainer.save_checkpoint(
                os.path.join(self.config["outdir"], f"checkpoint-{self.trainer.steps}steps.pkl")
            )
            logger.info(f"Successfully saved checkpoint @ {self.trainer.steps}steps.")

    def _build_dataset(
            self, subset: str
    ) -> ReprDataset:
        data_dir = os.path.join(
            self.data_path, self.config['data']['subset'][subset]
        )
        params = {
            "data_dir": data_dir,
            "batch_len": self.batch_length
        }
        return ReprDataset(**params)

    def _set_data_loader(self, dataset, collater):
        self.data_loader = {
            "train": DataLoader(
                dataset=dataset["train"],
                shuffle=True,
                collate_fn=collater,
                batch_size=self.config["batch_size"],
                num_workers=self.config["num_workers"],
                pin_memory=self.config["pin_memory"],
            ),
            "dev": DataLoader(
                dataset=dataset["dev"],
                shuffle=False,
                collate_fn=collater,
                batch_size=self.config["batch_size"],
                num_workers=0,
                pin_memory=False,  # save some memory. set to True if you have enough memory.
            ),
        }


def train():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-c", "--config", type=str, required=True,
        help="the path of config yaml file."
    )
    parser.add_argument(
        "--tag", type=str, required=True,
        help="the outputs will be saved to exp_root/tag/"
    )
    parser.add_argument(
        "--exp_root", type=str, default="exp"
    )
    parser.add_argument(
        "--resume", default="", type=str, nargs="?",
        help='checkpoint file path to resume training. (default="")',
    )
    parser.add_argument("--seed", default=1337, type=int)
    parser.add_argument("--disable_cudnn", choices=("True", "False"), default="False", help="Disable CUDNN")
    args = parser.parse_args()

    train_main = TrainMain(args)
    train_main.initialize_data_loader()
    train_main.define_model_optimizer_scheduler()
    train_main.define_criterion()
    train_main.define_trainer()
    train_main.initialize_model()
    train_main.run()


if __name__ == '__main__':
    train()
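A hypothetical launch command matching the argparse options above (the config path refers to the file added next; tag and exp_root are placeholders):

python RepCodec/train.py -c RepCodec/train_configs/ex_dim768_mse.yaml --tag dim768_mse --exp_root exp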
RepCodec/train_configs/ex_dim768_mse.yaml
ADDED
@@ -0,0 +1,74 @@
###########################################################
#                      DATA SETTING                       #
###########################################################
data:
  path: "/dir/to/representations/"
  subset:
    train: "train_set_name"
    valid: "valid_set_name"
    test: "test_set_name"

###########################################################
#                     MODEL SETTING                       #
###########################################################
model_params:
  input_channels: 768
  output_channels: 768
  encode_channels: 768
  decode_channels: 768
  code_dim: 768
  codebook_num: 1
  codebook_size: 1024
  bias: true
  enc_ratios: [1, 1]
  dec_ratios: [1, 1]
  enc_strides: [1, 1]  # no downsampling
  dec_strides: [1, 1]
  enc_kernel_size: 3
  dec_kernel_size: 3
  enc_block_dilations: [1, 1]
  enc_block_kernel_size: 3
  dec_block_dilations: [1, 1]
  dec_block_kernel_size: 3

###########################################################
#                  METRIC LOSS SETTING                    #
###########################################################
repr_reconstruct_loss_params:
  loss_type: l2

###########################################################
#                  LOSS WEIGHT SETTING                    #
###########################################################
lambda_vq_loss: 1.0                 # Loss weight of vector quantize loss.
lambda_repr_reconstruct_loss: 45.0

###########################################################
#                  DATA LOADER SETTING                    #
###########################################################
batch_size: 32      # Batch size.
batch_length: 96    # Length of each audio in batch (training w/o adv).
pin_memory: true    # Whether to pin memory in Pytorch DataLoader.
num_workers: 4      # Number of workers in Pytorch DataLoader.

###########################################################
#             OPTIMIZER & SCHEDULER SETTING               #
###########################################################
model_optimizer_type: Adam
model_optimizer_params:
  lr: 1.0e-4
  betas: [0.5, 0.9]
  weight_decay: 0.0
model_scheduler_type: StepLR
model_scheduler_params:
  step_size: 200000  # Model's scheduler step size.
  gamma: 1.0
grad_norm: -1

###########################################################
#                    INTERVAL SETTING                     #
###########################################################
train_max_steps: 200000     # Number of training steps. (w/o adv)
save_interval_steps: 20000  # Interval steps to save checkpoint.
eval_interval_steps: 2000   # Interval steps to evaluate the network.
log_interval_steps: 100     # Interval steps to record the training log.
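How the two lambda weights enter the objective, paraphrasing trainer/autoencoder.py below (an illustrative, self-contained sketch; loss_type l2 is assumed to behave like a mean-squared error on the representations, and all tensors here are random placeholders):

import torch
import torch.nn.functional as F

lambda_vq_loss, lambda_repr_reconstruct_loss = 1.0, 45.0   # values from this config
x = torch.randn(4, 768, 96)       # a batch of input representations
y_hat = torch.randn(4, 768, 96)   # reconstruction from RepCodec (placeholder)
vq_loss = torch.rand(1)           # VQ loss returned by the quantizer (placeholder)

# one training step, schematically:
codec_loss = lambda_vq_loss * vq_loss.sum() + lambda_repr_reconstruct_loss * F.mse_loss(y_hat, x)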
RepCodec/trainer/autoencoder.py
ADDED
@@ -0,0 +1,287 @@
# Copyright (c) ByteDance, Inc. and its affiliates.
# Copyright (c) Chutong Meng
#
# This source code is licensed under the CC BY-NC license found in the
# LICENSE file in the root directory of this source tree.
# Based on AudioDec (https://github.com/facebookresearch/AudioDec)

import logging
import os
from collections import defaultdict

import torch
from tensorboardX import SummaryWriter
from tqdm import tqdm

logger = logging.getLogger("repcodec_train")


class Trainer:
    def __init__(
            self,
            steps: int,
            epochs: int,
            data_loader: dict,
            model: dict,
            criterion: dict,
            optimizer: dict,
            scheduler: dict,
            config: dict,
            device=torch.device("cpu"),
    ):
        self.steps = steps
        self.epochs = epochs
        self.data_loader = data_loader
        self.model = model
        self.criterion = criterion
        self.optimizer = optimizer
        self.scheduler = scheduler
        self.config = config
        self.device = device
        self.writer = SummaryWriter(config["outdir"])
        self.total_train_loss = defaultdict(float)
        self.total_eval_loss = defaultdict(float)
        self.train_max_steps = config.get("train_max_steps", 0)

    def _train_step(self, batch):
        """Single step of training."""
        mode = "train"
        x = batch
        x = x.to(self.device)

        codec_loss = 0.0
        y_, zq, z, vqloss, perplexity = self.model["repcodec"](x)
        self._perplexity(perplexity, mode=mode)
        codec_loss += self._vq_loss(vqloss, mode=mode)
        codec_loss += self._metric_loss(y_, x, mode=mode)

        self._record_loss("codec_loss", codec_loss, mode=mode)
        self._update_repcodec(codec_loss)

        self.steps += 1
        self.tqdm.update(1)
        self._check_train_finish()

    @torch.no_grad()
    def _eval_step(self, batch):
        """Single step of evaluation."""
        mode = "eval"
        x = batch
        x = x.to(self.device)

        codec_loss = 0.0
        y_, zq, z, vqloss, perplexity = self.model["repcodec"](x)
        self._perplexity(perplexity, mode=mode)
        codec_loss += self._vq_loss(vqloss, mode=mode)
        codec_loss += self._metric_loss(y_, x, mode=mode)

        self._record_loss("codec_loss", codec_loss, mode=mode)

    def run(self):
        """Run training."""
        self.finish_train = False
        self.tqdm = tqdm(
            initial=self.steps, total=self.train_max_steps, desc="[train]"
        )
        while True:
            self._train_epoch()

            # check whether training is finished
            if self.finish_train:
                break

        self.tqdm.close()
        logger.info("Finished training.")

    def save_checkpoint(self, checkpoint_path: str):
        state_dict = {
            "model": {
                "repcodec": self.model["repcodec"].state_dict()
            },
            "optimizer": {
                "repcodec": self.optimizer["repcodec"].state_dict(),
            },
            "scheduler": {
                "repcodec": self.scheduler["repcodec"].state_dict(),
            },
            "steps": self.steps,
            "epochs": self.epochs,
        }

        if not os.path.exists(os.path.dirname(checkpoint_path)):
            os.makedirs(os.path.dirname(checkpoint_path))
        torch.save(state_dict, checkpoint_path)

    def load_checkpoint(
            self,
            checkpoint_path: str,
            strict: bool = True,
            load_only_params: bool = False
    ):
        state_dict = torch.load(checkpoint_path, map_location="cpu")
        self.model["repcodec"].load_state_dict(
            state_dict["model"]["repcodec"], strict=strict
        )

        if not load_only_params:
            self.steps = state_dict["steps"]
            self.epochs = state_dict["epochs"]
            self.optimizer["repcodec"].load_state_dict(
                state_dict["optimizer"]["repcodec"]
            )
            self.scheduler["repcodec"].load_state_dict(
                state_dict["scheduler"]["repcodec"]
            )

    def _train_epoch(self):
        """One epoch of training."""
        for train_steps_per_epoch, batch in enumerate(self.data_loader["train"], 1):
            # train one step
            self._train_step(batch)

            # check interval
            self._check_log_interval()
            self._check_eval_interval()
            self._check_save_interval()

            # check whether training is finished
            if self.finish_train:
                return

        # update
        self.epochs += 1
        self.train_steps_per_epoch = train_steps_per_epoch
        if train_steps_per_epoch > 200:
            logger.info(
                f"(Steps: {self.steps}) Finished {self.epochs} epoch training "
                f"({self.train_steps_per_epoch} steps per epoch)."
            )

    def _eval_epoch(self):
        """One epoch of evaluation."""
        logger.info(f"(Steps: {self.steps}) Start evaluation.")
        # change mode
        for key in self.model.keys():
            self.model[key].eval()

        # calculate loss for each batch
        for eval_steps_per_epoch, batch in enumerate(
                tqdm(self.data_loader["dev"], desc="[eval]"), 1
        ):
            # eval one step
            self._eval_step(batch)

        logger.info(
            f"(Steps: {self.steps}) Finished evaluation "
            f"({eval_steps_per_epoch} steps per epoch)."
        )

        # average loss
        for key in self.total_eval_loss.keys():
            self.total_eval_loss[key] /= eval_steps_per_epoch
            logger.info(
                f"(Steps: {self.steps}) {key} = {self.total_eval_loss[key]:.4f}."
            )

        # record
        self._write_to_tensorboard(self.total_eval_loss)

        # reset
        self.total_eval_loss = defaultdict(float)

        # restore mode
        for key in self.model.keys():
            self.model[key].train()

    def _metric_loss(self, predict_y, natural_y, mode='train'):
        """Metric losses."""
        metric_loss = 0.0

        repr_reconstruct_loss = self.criterion["repr_reconstruct_loss"](predict_y, natural_y)
        repr_reconstruct_loss *= self.config["lambda_repr_reconstruct_loss"]
        self._record_loss("reconstruct_loss", repr_reconstruct_loss, mode=mode)
        metric_loss += repr_reconstruct_loss

        return metric_loss

    def _update_repcodec(self, repr_loss):
        """Update generator."""
        self.optimizer["repcodec"].zero_grad()
        repr_loss.backward()
        if self.config["grad_norm"] > 0:
            torch.nn.utils.clip_grad_norm_(
                self.model["repcodec"].parameters(),
                self.config["grad_norm"],
            )
        self.optimizer["repcodec"].step()
        self.scheduler["repcodec"].step()

    def _record_loss(self, name: str, loss, mode='train'):
        """Record loss."""
        if torch.is_tensor(loss):
            loss = loss.item()

        if mode == 'train':
            self.total_train_loss[f"train/{name}"] += loss
        elif mode == 'eval':
            self.total_eval_loss[f"eval/{name}"] += loss
        else:
            raise NotImplementedError(f"Mode ({mode}) is not supported!")

    def _write_to_tensorboard(self, loss):
        """Write to tensorboard."""
        for key, value in loss.items():
            self.writer.add_scalar(key, value, self.steps)

    def _check_save_interval(self):
        if self.steps and (self.steps % self.config["save_interval_steps"] == 0):
            self.save_checkpoint(
                os.path.join(self.config["outdir"], f"checkpoint-{self.steps}steps.pkl")
            )
            logger.info(f"Successfully saved checkpoint @ {self.steps} steps.")

    def _check_eval_interval(self):
        if self.steps % self.config["eval_interval_steps"] == 0:
            self._eval_epoch()

    def _check_log_interval(self):
        if self.steps % self.config["log_interval_steps"] == 0:
            for key in self.total_train_loss.keys():
                self.total_train_loss[key] /= self.config["log_interval_steps"]
                logger.info(
                    f"(Steps: {self.steps}) {key} = {self.total_train_loss[key]:.4f}."
                )
            self._write_to_tensorboard(self.total_train_loss)

            # reset
            self.total_train_loss = defaultdict(float)

    def _check_train_finish(self):
        if self.steps >= self.train_max_steps:
            self.finish_train = True
        else:
            self.finish_train = False
        return self.finish_train

    def _perplexity(self, perplexity, label=None, mode='train'):
        if label:
            name = f"{mode}/ppl_{label}"
        else:
            name = f"{mode}/ppl"
        if torch.numel(perplexity) > 1:
            perplexity = perplexity.tolist()
            for idx, ppl in enumerate(perplexity):
                self._record_loss(f"{name}_{idx}", ppl, mode=mode)
        else:
            self._record_loss(name, perplexity, mode=mode)

    def _vq_loss(self, vqloss, label=None, mode='train'):
        if label:
            name = f"{mode}/vqloss_{label}"
        else:
            name = f"{mode}/vqloss"
        vqloss = torch.sum(vqloss)
        vqloss *= self.config["lambda_vq_loss"]
        self._record_loss(name, vqloss, mode=mode)

        return vqloss
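A small sketch for inspecting a checkpoint written by save_checkpoint above (the path is a placeholder); the nested model/repcodec entry is exactly what repcodec/tokenize.py's load_model reads back:

import torch

ckpt = torch.load("exp/dim768_mse/checkpoint-200000steps.pkl", map_location="cpu")
print(sorted(ckpt.keys()))             # ['epochs', 'model', 'optimizer', 'scheduler', 'steps']
weights = ckpt["model"]["repcodec"]    # state dict consumed by tokenize.load_model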
__pycache__/post_process_audio.cpython-310.pyc
ADDED
Binary file (3.1 kB).
__pycache__/vocoder.cpython-310.pyc
ADDED
Binary file (5.75 kB).
decoders/config.yaml
ADDED
@@ -0,0 +1,15 @@
backbone:
  class_path: vocos.models.VocosBackbone
  init_args:
    input_channels: 1024
    dim: 512
    intermediate_dim: 1536
    num_layers: 8

head:
  class_path: vocos.heads.ISTFTHead
  init_args:
    dim: 512
    n_fft: 3528
    hop_length: 882
    padding: same
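A minimal sketch of instantiating this decoder config, assuming the vocos package is installed; the loader relies only on the class_path/init_args layout above and does not assume any other Vocos API:

import importlib
import yaml

def build(node):
    # Resolve "package.module.ClassName" and construct it with the YAML init_args.
    module_name, cls_name = node["class_path"].rsplit(".", 1)
    cls = getattr(importlib.import_module(module_name), cls_name)
    return cls(**node.get("init_args", {}))

with open("decoders/config.yaml") as f:
    cfg = yaml.safe_load(f)

backbone = build(cfg["backbone"])   # vocos.models.VocosBackbone
head = build(cfg["head"])           # vocos.heads.ISTFTHead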
decoders/decoder_131000.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b99f0be84eeef3a32f29cd55beb89727fd0b2fd0df3dbad3023508f4c7185c37
size 72611958
decoders/decoder_151000.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8af97a29d3483f9d4a3755992837501bd7d6caa1a69382ed16e64039e0ea0998
size 72610550
descriptaudiocodec/dac/__init__.py
ADDED
@@ -0,0 +1,16 @@
__version__ = "1.0.0"

# preserved here for legacy reasons
__model_version__ = "latest"

import audiotools

audiotools.ml.BaseModel.INTERN += ["dac.**"]
audiotools.ml.BaseModel.EXTERN += ["einops"]


from . import nn
from . import model
from . import utils
from .model import DAC
from .model import DACFile
descriptaudiocodec/dac/__main__.py
ADDED
@@ -0,0 +1,36 @@
import sys

import argbind

from dac.utils import download
from dac.utils.decode import decode
from dac.utils.encode import encode

STAGES = ["encode", "decode", "download"]


def run(stage: str):
    """Run stages.

    Parameters
    ----------
    stage : str
        Stage to run
    """
    if stage not in STAGES:
        raise ValueError(f"Unknown command: {stage}. Allowed commands are {STAGES}")
    stage_fn = globals()[stage]

    if stage == "download":
        stage_fn()
        return

    stage_fn()


if __name__ == "__main__":
    group = sys.argv.pop(1)
    args = argbind.parse_args(group=group)

    with argbind.scope(args):
        run(group)
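This __main__ entry dispatches on the first positional argument, so the bundled Descript audio codec can be driven as python -m dac download, python -m dac encode ... or python -m dac decode ...; the remaining flags come from the argbind bindings in dac.utils and are not repeated here.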
descriptaudiocodec/dac/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (516 Bytes).
descriptaudiocodec/dac/__pycache__/__init__.cpython-38.pyc
ADDED
Binary file (488 Bytes).
descriptaudiocodec/dac/__pycache__/__init__.cpython-39.pyc
ADDED
Binary file (489 Bytes).
descriptaudiocodec/dac/compare/__init__.py
ADDED
File without changes
descriptaudiocodec/dac/compare/encodec.py
ADDED
@@ -0,0 +1,54 @@
import torch
from audiotools import AudioSignal
from audiotools.ml import BaseModel
from encodec import EncodecModel


class Encodec(BaseModel):
    def __init__(self, sample_rate: int = 24000, bandwidth: float = 24.0):
        super().__init__()

        if sample_rate == 24000:
            self.model = EncodecModel.encodec_model_24khz()
        else:
            self.model = EncodecModel.encodec_model_48khz()
        self.model.set_target_bandwidth(bandwidth)
        self.sample_rate = 44100

    def forward(
        self,
        audio_data: torch.Tensor,
        sample_rate: int = 44100,
        n_quantizers: int = None,
    ):
        signal = AudioSignal(audio_data, sample_rate)
        signal.resample(self.model.sample_rate)
        recons = self.model(signal.audio_data)
        recons = AudioSignal(recons, self.model.sample_rate)
        recons.resample(sample_rate)
        return {"audio": recons.audio_data}


if __name__ == "__main__":
    import numpy as np
    from functools import partial

    model = Encodec()

    for n, m in model.named_modules():
        o = m.extra_repr()
        p = sum([np.prod(p.size()) for p in m.parameters()])
        fn = lambda o, p: o + f" {p/1e6:<.3f}M params."
        setattr(m, "extra_repr", partial(fn, o=o, p=p))
    print(model)
    print("Total # of params: ", sum([np.prod(p.size()) for p in model.parameters()]))

    length = 88200 * 2
    x = torch.randn(1, 1, length).to(model.device)
    x.requires_grad_(True)
    x.retain_grad()

    # Make a forward pass
    out = model(x)["audio"]

    print(x.shape, out.shape)
descriptaudiocodec/dac/model/__init__.py
ADDED
@@ -0,0 +1,4 @@
from .base import CodecMixin
from .base import DACFile
from .dac import DAC
from .discriminator import Discriminator
descriptaudiocodec/dac/model/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (357 Bytes).
descriptaudiocodec/dac/model/__pycache__/__init__.cpython-39.pyc
ADDED
Binary file (330 Bytes).
descriptaudiocodec/dac/model/__pycache__/base.cpython-310.pyc
ADDED
Binary file (7.26 kB).
descriptaudiocodec/dac/model/__pycache__/base.cpython-39.pyc
ADDED
Binary file (7.18 kB).
descriptaudiocodec/dac/model/__pycache__/dac.cpython-310.pyc
ADDED
Binary file (10.8 kB).