camenduru commited on
Commit
7934b29
·
1 Parent(s): 6f6b1ba

thanks to NVIDIA ❤

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .dockerignore +19 -0
  2. .gitattributes +11 -0
  3. .github/ISSUE_TEMPLATE/bug_report.md +42 -0
  4. .github/ISSUE_TEMPLATE/feature_request.md +25 -0
  5. .github/PULL_REQUEST_TEMPLATE.md +39 -0
  6. .github/labeler.yml +33 -0
  7. .github/workflows/blossom-ci.yml +104 -0
  8. .github/workflows/changelog-build.yml +47 -0
  9. .github/workflows/cherry-pick-release-commit.yml +28 -0
  10. .github/workflows/close-inactive-issue-pr.yml +25 -0
  11. .github/workflows/codeql.yml +74 -0
  12. .github/workflows/config/changelog-config.json +134 -0
  13. .github/workflows/gh-docs.yml +38 -0
  14. .github/workflows/import-test.yml +63 -0
  15. .github/workflows/labeler.yml +14 -0
  16. .gitignore +181 -0
  17. .pre-commit-config.yaml +47 -0
  18. .readthedocs.yml +31 -0
  19. CITATION.cff +41 -0
  20. CONTRIBUTING.md +79 -0
  21. Dockerfile +127 -0
  22. Jenkinsfile +0 -0
  23. LICENSE +201 -0
  24. PUBLICATIONS.md +213 -0
  25. README.rst +319 -0
  26. ci.groovy +119 -0
  27. docs/.nojekyll +0 -0
  28. docs/Makefile +216 -0
  29. docs/source/_static/css/custom.css +298 -0
  30. docs/source/_static/js/pk_scripts.js +19 -0
  31. docs/source/_templates/layout.html +14 -0
  32. docs/source/asr/api.rst +299 -0
  33. docs/source/asr/asr_all.bib +1043 -0
  34. docs/source/asr/asr_language_modeling.rst +395 -0
  35. docs/source/asr/configs.rst +929 -0
  36. docs/source/asr/data/asrlm_results.csv +2 -0
  37. docs/source/asr/data/benchmark_ca.csv +4 -0
  38. docs/source/asr/data/benchmark_de.csv +6 -0
  39. docs/source/asr/data/benchmark_en.csv +28 -0
  40. docs/source/asr/data/benchmark_es.csv +7 -0
  41. docs/source/asr/data/benchmark_fr.csv +8 -0
  42. docs/source/asr/data/benchmark_hi.csv +2 -0
  43. docs/source/asr/data/benchmark_hr.csv +3 -0
  44. docs/source/asr/data/benchmark_it.csv +3 -0
  45. docs/source/asr/data/benchmark_kab.csv +2 -0
  46. docs/source/asr/data/benchmark_mr.csv +3 -0
  47. docs/source/asr/data/benchmark_pl.csv +2 -0
  48. docs/source/asr/data/benchmark_ru.csv +3 -0
  49. docs/source/asr/data/benchmark_rw.csv +3 -0
  50. docs/source/asr/data/benchmark_zh.csv +4 -0
.dockerignore ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __pycache__
2
+ *.pyc
3
+ *.pyo
4
+ *.pyd
5
+ .Python
6
+ env
7
+ pip-log.txt
8
+ pip-delete-this-directory.txt
9
+ .tox
10
+ .coverage
11
+ .coverage.*
12
+ .cache
13
+ nosetests.xml
14
+ coverage.xml
15
+ *,cover
16
+ *.log
17
+ .git
18
+ **/*.nemo
19
+ **/*.ckpt
.gitattributes CHANGED
@@ -32,3 +32,14 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
35
+ docs/source/nlp/dialogue_UML.png filter=lfs diff=lfs merge=lfs -text
36
+ docs/source/nlp/nemo_megatron/images/ddp.gif filter=lfs diff=lfs merge=lfs -text
37
+ docs/source/nlp/nemo_megatron/images/pnom.gif filter=lfs diff=lfs merge=lfs -text
38
+ docs/source/nlp/nemo_megatron/images/pp.gif filter=lfs diff=lfs merge=lfs -text
39
+ docs/source/nlp/nemo_megatron/images/tp.gif filter=lfs diff=lfs merge=lfs -text
40
+ docs/source/tts/images/fastpitch_model.png filter=lfs diff=lfs merge=lfs -text
41
+ docs/source/tts/images/radaligner_model.png filter=lfs diff=lfs merge=lfs -text
42
+ docs/source/tts/images/tacotron2_model.png filter=lfs diff=lfs merge=lfs -text
43
+ docs/source/tts/images/waveglow_model.png filter=lfs diff=lfs merge=lfs -text
44
+ examples/nlp/language_modeling/nemo_2b_bf16_tp1.nemo filter=lfs diff=lfs merge=lfs -text
45
+ tools/speech_data_explorer/screenshot.png filter=lfs diff=lfs merge=lfs -text
.github/ISSUE_TEMPLATE/bug_report.md ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ name: Bug report
3
+ about: Create a report to help us improve
4
+ title: ''
5
+ labels: bug
6
+ assignees: ''
7
+
8
+ ---
9
+
10
+ **Describe the bug**
11
+
12
+ A clear and concise description of what the bug is.
13
+
14
+ **Steps/Code to reproduce bug**
15
+
16
+ Please list *minimal* steps or code snippet for us to be able to reproduce the bug.
17
+
18
+ A helpful guide on how to craft a minimal bug report http://matthewrocklin.com/blog/work/2018/02/28/minimal-bug-reports.
19
+
20
+
21
+ **Expected behavior**
22
+
23
+ A clear and concise description of what you expected to happen.
24
+
25
+ **Environment overview (please complete the following information)**
26
+
27
+ - Environment location: [Bare-metal, Docker, Cloud(specify cloud provider - AWS, Azure, GCP, Colab)]
28
+ - Method of NeMo install: [pip install or from source]. Please specify exact commands you used to install.
29
+ - If method of install is [Docker], provide `docker pull` & `docker run` commands used
30
+
31
+ **Environment details**
32
+
33
+ If NVIDIA docker image is used you don't need to specify these.
34
+ Otherwise, please provide:
35
+ - OS version
36
+ - PyTorch version
37
+ - Python version
38
+
39
+ **Additional context**
40
+
41
+ Add any other context about the problem here.
42
+ Example: GPU model
.github/ISSUE_TEMPLATE/feature_request.md ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ name: Feature request
3
+ about: Suggest an idea for this project
4
+ title: ''
5
+ labels: feature request
6
+ assignees: okuchaiev
7
+
8
+ ---
9
+
10
+ **Is your feature request related to a problem? Please describe.**
11
+
12
+ A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
13
+
14
+ **Describe the solution you'd like**
15
+
16
+ A clear and concise description of what you want to happen.
17
+ Provide a code snippet on how new APIs/changes would be used by others.
18
+
19
+ **Describe alternatives you've considered**
20
+
21
+ A clear and concise description of any alternative solutions or features you've considered.
22
+
23
+ **Additional context**
24
+
25
+ Add any other context or screenshots about the feature request here.
.github/PULL_REQUEST_TEMPLATE.md ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # What does this PR do ?
2
+
3
+ Add a one line overview of what this PR aims to accomplish.
4
+
5
+ **Collection**: [Note which collection this PR will affect]
6
+
7
+ # Changelog
8
+ - Add specific line by line info of high level changes in this PR.
9
+
10
+ # Usage
11
+ * You can potentially add a usage example below
12
+
13
+ ```python
14
+ # Add a code snippet demonstrating how to use this
15
+ ```
16
+
17
+ # Before your PR is "Ready for review"
18
+ **Pre checks**:
19
+ - [ ] Make sure you read and followed [Contributor guidelines](https://github.com/NVIDIA/NeMo/blob/main/CONTRIBUTING.md)
20
+ - [ ] Did you write any new necessary tests?
21
+ - [ ] Did you add or update any necessary documentation?
22
+ - [ ] Does the PR affect components that are optional to install? (Ex: Numba, Pynini, Apex etc)
23
+ - [ ] Reviewer: Does the PR have correct import guards for all optional libraries?
24
+
25
+ **PR Type**:
26
+ - [ ] New Feature
27
+ - [ ] Bugfix
28
+ - [ ] Documentation
29
+
30
+ If you haven't finished some of the above items you can still open a "Draft" PR.
31
+
32
+
33
+ ## Who can review?
34
+
35
+ Anyone in the community is free to review the PR once the checks have passed.
36
+ [Contributor guidelines](https://github.com/NVIDIA/NeMo/blob/main/CONTRIBUTING.md) contains specific people who can review PRs to various areas.
37
+
38
+ # Additional Information
39
+ * Related to # (issue)
.github/labeler.yml ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ASR:
2
+ - nemo/collections/asr/**/*
3
+ - examples/asr/**/*
4
+ - tutorials/asr/**/*
5
+ - docs/source/asr/**/*
6
+
7
+ NLP:
8
+ - nemo/collections/nlp/**/*
9
+ - examples/nlp/**/*
10
+ - tutorials/nlp/**/*
11
+ - docs/source/nlp/**/*
12
+
13
+ Speaker Tasks:
14
+ - examples/speaker_tasks/**/*
15
+ - tutorials/speaker_tasks/**/*
16
+
17
+ TTS:
18
+ - nemo/collections/tts/**/*
19
+ - examples/tts/**/*
20
+ - tutorials/tts/**/*
21
+ - docs/source/tts/**/*
22
+
23
+ core:
24
+ - nemo/core/**/*
25
+
26
+ common:
27
+ - nemo/collections/common/**/*
28
+
29
+ CI:
30
+ - .github/**/*
31
+ - Jenkinsfile
32
+ - Dockerfile
33
+ - ci.groovy
.github/workflows/blossom-ci.yml ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2020-2021, NVIDIA CORPORATION.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ # A workflow to trigger ci on hybrid infra (github + self hosted runner)
16
+ name: Blossom-CI
17
+ on:
18
+ issue_comment:
19
+ types: [created]
20
+ workflow_dispatch:
21
+ inputs:
22
+ platform:
23
+ description: 'runs-on argument'
24
+ required: false
25
+ args:
26
+ description: 'argument'
27
+ required: false
28
+ jobs:
29
+ Authorization:
30
+ name: Authorization
31
+ runs-on: blossom
32
+ outputs:
33
+ args: ${{ env.args }}
34
+
35
+ # This job only runs for pull request comments
36
+ if: |
37
+ contains( 'okuchaiev,ericharper,titu1994,MaximumEntropy,nithinraok,redoctopus,yidong72,SeanNaren,yzhang123,ekmb,arendu,', format('{0},', github.actor)) &&
38
+ github.event.comment.body == '/blossom-ci'
39
+ steps:
40
+ - name: Check if comment is issued by authorized person
41
+ run: blossom-ci
42
+ env:
43
+ OPERATION: 'AUTH'
44
+ REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
45
+ REPO_KEY_DATA: ${{ secrets.BLOSSOM_KEY }}
46
+
47
+ Vulnerability-scan:
48
+ name: Vulnerability scan
49
+ needs: [Authorization]
50
+ runs-on: ubuntu-latest
51
+ steps:
52
+ - name: Checkout code
53
+ uses: actions/checkout@v2
54
+ with:
55
+ repository: ${{ fromJson(needs.Authorization.outputs.args).repo }}
56
+ ref: ${{ fromJson(needs.Authorization.outputs.args).ref }}
57
+ lfs: 'true'
58
+
59
+ # repo specific steps
60
+ #- name: Setup java
61
+ # uses: actions/setup-java@v1
62
+ # with:
63
+ # java-version: 1.8
64
+
65
+ # add blackduck properties https://synopsys.atlassian.net/wiki/spaces/INTDOCS/pages/631308372/Methods+for+Configuring+Analysis#Using-a-configuration-file
66
+ #- name: Setup blackduck properties
67
+ # run: |
68
+ # PROJECTS=$(mvn -am dependency:tree | grep maven-dependency-plugin | awk '{ out="com.nvidia:"$(NF-1);print out }' | grep rapids | xargs | sed -e 's/ /,/g')
69
+ # echo detect.maven.build.command="-pl=$PROJECTS -am" >> application.properties
70
+ # echo detect.maven.included.scopes=compile >> application.properties
71
+
72
+ - name: Run blossom action
73
+ uses: NVIDIA/blossom-action@main
74
+ env:
75
+ REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
76
+ REPO_KEY_DATA: ${{ secrets.BLOSSOM_KEY }}
77
+ with:
78
+ args1: ${{ fromJson(needs.Authorization.outputs.args).args1 }}
79
+ args2: ${{ fromJson(needs.Authorization.outputs.args).args2 }}
80
+ args3: ${{ fromJson(needs.Authorization.outputs.args).args3 }}
81
+
82
+ Job-trigger:
83
+ name: Start ci job
84
+ needs: [Vulnerability-scan]
85
+ runs-on: blossom
86
+ steps:
87
+ - name: Start ci job
88
+ run: blossom-ci
89
+ env:
90
+ OPERATION: 'START-CI-JOB'
91
+ CI_SERVER: ${{ secrets.CI_SERVER }}
92
+ REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
93
+
94
+ Upload-Log:
95
+ name: Upload log
96
+ runs-on: blossom
97
+ if : github.event_name == 'workflow_dispatch'
98
+ steps:
99
+ - name: Jenkins log for pull request ${{ fromJson(github.event.inputs.args).pr }} (click here)
100
+ run: blossom-ci
101
+ env:
102
+ OPERATION: 'POST-PROCESSING'
103
+ CI_SERVER: ${{ secrets.CI_SERVER }}
104
+ REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
.github/workflows/changelog-build.yml ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: 'Changelog Build (Release)'
2
+
3
+ on:
4
+ push:
5
+ tags:
6
+ - '*'
7
+
8
+ jobs:
9
+ changelog:
10
+ if: startsWith(github.ref, 'refs/tags/')
11
+ runs-on: ubuntu-latest
12
+ steps:
13
+ - uses: actions/checkout@v2
14
+ with:
15
+ fetch-depth: 0 # Required due to the way Git works, without it this action won't be able to find any or the correct tags
16
+
17
+ - name: Get Previous tag
18
+ id: previous_tag
19
+ # git for-each-ref --sort=-creatordate --format '%(refname)' refs/tags ==> refs/tags/vX.Y.Z in descending order of date
20
+ # awk 'FNR == 2 {print substr($1, 11, length($1))}') ==> Selects the 2nd tag from the list, then strips the /refs/tags/ part of the tag
21
+ # set-output name=tag_name:: ==> Takes the clean tag vX.Y.Z and sets it to steps.previous_tag.outputs.tag_name
22
+ run: |
23
+ echo "::set-output name=tag_name::$(git for-each-ref --sort=-creatordate --format '%(refname)' refs/tags | awk 'FNR == 2 {print substr($1, 11, length($1))}')"
24
+ echo ${{ steps.previous_tag.outputs.tag_name }}
25
+
26
+ - name: Build Changelog
27
+ id: github_tag
28
+ uses: mikepenz/[email protected]
29
+ env:
30
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
31
+ with:
32
+ # Configuration file is setup with filters for domains
33
+ # owner:repo must point to current repo
34
+ # fromTag: Auto resolved from historical tag order (previous tag compared to current tag)
35
+ # toTag: Current tag reference
36
+ configuration: ".github/workflows/config/changelog-config.json"
37
+ owner: "NVIDIA"
38
+ repo: "NeMo"
39
+ ignorePreReleases: "false"
40
+ failOnError: "false"
41
+ fromTag: ${{ steps.previous_tag.outputs.tag_name }}
42
+ toTag: ${{ github.ref_name }}
43
+
44
+ - name: Print Changelog
45
+ run: |
46
+ echo "${{steps.github_tag.outputs.changelog}}"
47
+ echo "--- DONE ---"
.github/workflows/cherry-pick-release-commit.yml ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Create PR to main with cherry-pick from release
2
+
3
+ on:
4
+ pull_request_target:
5
+ branches:
6
+ - 'r*.*.*'
7
+ types: ["closed"]
8
+
9
+ jobs:
10
+ cherry-pick-release-commit:
11
+ name: Cherry-pick release commit
12
+ runs-on: ubuntu-latest
13
+ steps:
14
+ - name: Checkout
15
+ uses: actions/checkout@v3
16
+ with:
17
+ fetch-depth: 0
18
+ - name: github-cherry-pick-action v1.0.3
19
+ uses: carloscastrojumo/github-cherry-pick-action@bb0869df47c27be4ae4c7a2d93d22827aa5a0054
20
+ with:
21
+ branch: main
22
+ labels: |
23
+ cherry-pick
24
+ reviewers: |
25
+ ${{ github.event.pull_request.user.login }}
26
+
27
+ env:
28
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
.github/workflows/close-inactive-issue-pr.yml ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Stale-Close-Inactive-Issues-PRs
2
+ on:
3
+ schedule:
4
+ - cron: "30 1 * * *"
5
+
6
+ jobs:
7
+ close-issues:
8
+ runs-on: ubuntu-latest
9
+ permissions:
10
+ issues: write
11
+ pull-requests: write
12
+ steps:
13
+ - uses: actions/stale@v6
14
+ with:
15
+ operations-per-run: 100
16
+ days-before-issue-stale: 30
17
+ days-before-issue-close: 7
18
+ stale-issue-label: "stale"
19
+ stale-issue-message: "This issue is stale because it has been open for 30 days with no activity. Remove stale label or comment or this will be closed in 7 days."
20
+ close-issue-message: "This issue was closed because it has been inactive for 7 days since being marked as stale."
21
+ days-before-pr-stale: 14
22
+ days-before-pr-close: 7
23
+ stale-pr-message: "This PR is stale because it has been open for 14 days with no activity. Remove stale label or comment or update or this will be closed in 7 days."
24
+ close-pr-message: "This PR was closed because it has been inactive for 7 days since being marked as stale."
25
+ repo-token: ${{ secrets.GITHUB_TOKEN }}
.github/workflows/codeql.yml ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # For most projects, this workflow file will not need changing; you simply need
2
+ # to commit it to your repository.
3
+ #
4
+ # You may wish to alter this file to override the set of languages analyzed,
5
+ # or to provide custom queries or build logic.
6
+ #
7
+ # ******** NOTE ********
8
+ # We have attempted to detect the languages in your repository. Please check
9
+ # the `language` matrix defined below to confirm you have the correct set of
10
+ # supported CodeQL languages.
11
+ #
12
+ name: "CodeQL"
13
+
14
+ on:
15
+ push:
16
+ branches: [ "main", "[rv][0-9]*", "gh-pages-src" ]
17
+ pull_request:
18
+ # The branches below must be a subset of the branches above
19
+ branches: [ "main" ]
20
+ schedule:
21
+ - cron: '19 1 * * 4'
22
+
23
+ jobs:
24
+ analyze:
25
+ name: Analyze
26
+ runs-on: ubuntu-latest
27
+ permissions:
28
+ actions: read
29
+ contents: read
30
+ security-events: write
31
+
32
+ strategy:
33
+ fail-fast: false
34
+ matrix:
35
+ language: [ 'python' ]
36
+ # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ]
37
+ # Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support
38
+
39
+ steps:
40
+ - name: Checkout repository
41
+ uses: actions/checkout@v3
42
+
43
+ # Initializes the CodeQL tools for scanning.
44
+ - name: Initialize CodeQL
45
+ uses: github/codeql-action/init@v2
46
+ with:
47
+ languages: ${{ matrix.language }}
48
+ # If you wish to specify custom queries, you can do so here or in a config file.
49
+ # By default, queries listed here will override any specified in a config file.
50
+ # Prefix the list here with "+" to use these queries and those in the config file.
51
+
52
+ # For details on CodeQL's query packs, refer to: https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs
53
+ queries: security-and-quality # security-extended,
54
+
55
+
56
+ # Autobuild attempts to build any compiled languages (C/C++, C#, Go, or Java).
57
+ # If this step fails, then you should remove it and run the build manually (see below)
58
+ - name: Autobuild
59
+ uses: github/codeql-action/autobuild@v2
60
+
61
+ # ℹ️ Command-line programs to run using the OS shell.
62
+ # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun
63
+
64
+ # If the Autobuild fails above, remove it and uncomment the following three lines.
65
+ # Modify them (or add more) to build your code if your project requires it; please refer to the EXAMPLE below for guidance.
66
+
67
+ # - run: |
68
+ # echo "Run, Build Application using script"
69
+ # ./location_of_script_within_repo/buildscript.sh
70
+
71
+ - name: Perform CodeQL Analysis
72
+ uses: github/codeql-action/analyze@v2
73
+ with:
74
+ category: "/language:${{matrix.language}}"
.github/workflows/config/changelog-config.json ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "categories": [
3
+ {
4
+ "title": "## ASR \n\n<details><summary>Changelog</summary>\n\n</details>\n\n",
5
+ "labels": ["asr"],
6
+ "exclude_labels": ["cherry-pick"]
7
+ },
8
+ {
9
+ "title": "## TTS \n\n<details><summary>Changelog</summary>\n\n</details>\n\n",
10
+ "labels": ["tts"],
11
+ "exclude_labels": ["cherry-pick"]
12
+ },
13
+ {
14
+ "title": "## NLP / NMT \n\n<details><summary>Changelog</summary>\n\n</details>\n\n",
15
+ "labels": ["nlp", "nmt", "megatron"],
16
+ "exclude_labels": ["cherry-pick"]
17
+ },
18
+ {
19
+ "title": "## Text Normalization / Inverse Text Normalization \n\n<details><summary>Changelog</summary>\n\n</details>\n\n",
20
+ "labels": ["tn", "itn"],
21
+ "exclude_labels": ["cherry-pick"]
22
+ },
23
+ {
24
+ "title": "## NeMo Tools \n\n<details><summary>Changelog</summary>\n\n</details>\n\n",
25
+ "labels": ["tools"],
26
+ "exclude_labels": ["cherry-pick"]
27
+ },
28
+ {
29
+ "title": "## Export \n\n<details><summary>Changelog</summary>\n\n</details>\n\n",
30
+ "labels": ["export"],
31
+ "exclude_labels": ["cherry-pick"]
32
+ },
33
+ {
34
+ "title": "## Documentation \n\n<details><summary>Changelog</summary>\n\n</details>\n\n",
35
+ "labels": ["docs"],
36
+ "exclude_labels": ["cherry-pick"]
37
+ },
38
+ {
39
+ "title": "## Bugfixes \n\n<details><summary>Changelog</summary>\n\n</details>\n\n",
40
+ "labels": ["bug"],
41
+ "exclude_labels": ["cherry-pick"]
42
+ },
43
+ {
44
+ "title": "## Cherrypick \n\n<details><summary>Changelog</summary>\n\n</details>\n\n",
45
+ "labels": ["cherry-pick"],
46
+ "exclude_labels": ["cherry-pick"]
47
+ }
48
+ ],
49
+ "ignore_labels": [
50
+ "ignore"
51
+ ],
52
+ "sort": "ASC",
53
+ "template": "\n${{CHANGELOG}}\nUncategorized:\n${{UNCATEGORIZED}}\n\n",
54
+ "pr_template": "- ${{TITLE}} by @${{AUTHOR}} :: PR: #${{NUMBER}}",
55
+ "empty_template": "${{OWNER}}\n${{REPO}}\n${{FROM_TAG}}\n${{TO_TAG}}",
56
+ "label_extractor": [
57
+ {
58
+ "pattern": "(.*tts.*)|(.*g2p.*)",
59
+ "target": "tts",
60
+ "flags": "gimu",
61
+ "on_property": ["title", "body"]
62
+ },
63
+ {
64
+ "pattern": "(.*asr.*)|(.*ctc.*)|(.*rnnt.*)|(.*transducer.*)|(.*dali.*)|(.*k2.*)",
65
+ "target": "asr",
66
+ "flags": "gimu",
67
+ "on_property": ["title", "body"]
68
+ },
69
+ {
70
+ "pattern": "(.*nlp.*)|(.*punctuation.*)|(.*capitalization.*)|(.*entity.*)|(.*glue.*)|(.*entity.*)|(.*retrieval.*)|(.*entity.*)|(.*intent.*)|(.*slot.*)|(.*entity.*)|(.*language.*)|(.*qa.*)|(.*token class.*)|(.*text class.*)",
71
+ "target": "nlp",
72
+ "flags": "gimu",
73
+ "on_property": ["title", "body"]
74
+ },
75
+ {
76
+ "pattern": "(.*nmt.*)|(.*bignlp.*)|(.*megatron.*)|(.*machine.*)|(.*translation.*)|(.*gpt.*)",
77
+ "target": "nmt",
78
+ "flags": "gimu",
79
+ "on_property": ["title", "body"]
80
+ },
81
+ {
82
+ "pattern": "(.*tn.*)|(.*itn.*)|(.*text norm.*)",
83
+ "target": "tn",
84
+ "flags": "gimu",
85
+ "on_property": ["title", "body"]
86
+ },
87
+ {
88
+ "pattern": "(.*sde.*)|(.*ctc segment.*)",
89
+ "target": "tools",
90
+ "flags": "gimu",
91
+ "on_property": ["title", "body"]
92
+ },
93
+ {
94
+ "pattern": "(.*trt.*)|(.*onnx.*)|(.*export.*)",
95
+ "target": "export",
96
+ "flags": "gimu",
97
+ "on_property": ["title", "body"]
98
+ },
99
+ {
100
+ "pattern": "(.*\\[x\\] Documentation.*)",
101
+ "target": "docs",
102
+ "flags": "gmu",
103
+ "on_property": ["title", "body"]
104
+ },
105
+ {
106
+ "pattern": "(.*\\[x\\] Bugfix.*)|(.*patch.*)",
107
+ "target": "bug",
108
+ "flags": "gmu",
109
+ "on_property": ["title", "body"]
110
+ },
111
+ {
112
+ "pattern": "(.*cherry-pick.*)|(.*cherrypick.*)",
113
+ "target": "cherrypick",
114
+ "flags": "gimu",
115
+ "on_property": ["title", "body"]
116
+ }
117
+ ],
118
+ "duplicate_filter": {
119
+ "pattern": ".+",
120
+ "on_property": "title",
121
+ "method": "match"
122
+ },
123
+ "transformers": [
124
+ ],
125
+ "max_tags_to_fetch": 100,
126
+ "max_pull_requests": 500,
127
+ "max_back_track_time_days": 365,
128
+ "exclude_merge_branches": [
129
+ ],
130
+ "tag_resolver": {
131
+ "method": "semver"
132
+ }
133
+ }
134
+
.github/workflows/gh-docs.yml ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: gh-docs-build
2
+ on:
3
+ push:
4
+ pull_request:
5
+ paths:
6
+ - "**"
7
+
8
+ # Set the access for individual scopes
9
+ permissions: write-all
10
+
11
+ jobs:
12
+ deploy:
13
+ runs-on: ubuntu-latest
14
+
15
+ container:
16
+ image: squidfunk/mkdocs-material
17
+
18
+ steps:
19
+ - uses: actions/checkout@v3
20
+ if: github.event.repository.fork == false
21
+ with:
22
+ ref: gh-pages-src
23
+
24
+ - name: "Correct github config"
25
+ if: github.event.repository.fork == false
26
+ run: |
27
+ git config --global --add safe.directory "$GITHUB_WORKSPACE"
28
+ git config --global user.name "${GITHUB_ACTOR}"
29
+ git config --global user.email "${GITHUB_ACTOR}@users.noreply.${GITHUB_DOMAIN:-"github.com"}"
30
+ remote_repo="https://x-access-token:${GITHUB_TOKEN}@${GITHUB_DOMAIN:-"github.com"}/${GITHUB_REPOSITORY}.git"
31
+ echo "${remote_repo}"
32
+ git remote rm origin
33
+ git remote add origin "${remote_repo}"
34
+
35
+ - name: "Deploy Github Page"
36
+ continue-on-error: true
37
+ run: mkdocs gh-deploy --force
38
+
.github/workflows/import-test.yml ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: CI-Import-Check
2
+
3
+ on:
4
+ push:
5
+ pull_request:
6
+ paths:
7
+ - "**"
8
+
9
+ jobs:
10
+ ci-import-check:
11
+ runs-on: ubuntu-latest
12
+
13
+ # Check https://hub.docker.com/r/pytorch/pytorch/tags for latest tags
14
+ container:
15
+ image: pytorch/pytorch:1.11.0-cuda11.3-cudnn8-runtime
16
+
17
+ steps:
18
+ - uses: actions/checkout@v2
19
+
20
+ - name: Update base dependencies
21
+ run: |
22
+ apt-get update && apt-get install -y build-essential
23
+ apt-get install -y libsndfile1 make
24
+
25
+ - name: Install nemo dependencies
26
+ id: nemo-wheel
27
+ run: |
28
+ # install test requirements
29
+ pip install -r requirements/requirements_test.txt
30
+ # Build nemo as a wheel
31
+ pip install build
32
+ python -m build --no-isolation --wheel
33
+ # Preserve wheel location
34
+ DIST_FILE=$(find ./dist -name "*.whl" | head -n 1)
35
+ echo "::set-output name=DIST_FILE::${DIST_FILE}"
36
+
37
+ - name: Test ASR Domain Imports
38
+ run: |
39
+ # Install NeMo Domain
40
+ pip install "${{ steps.nemo-wheel.outputs.DIST_FILE }}[asr]"
41
+ # Run import checks
42
+ python tests/core_ptl/check_imports.py --domain "asr"
43
+ # Uninstall NeMo
44
+ pip uninstall -y nemo_toolkit
45
+
46
+ - name: Test TTS Domain Imports
47
+ run: |
48
+ # Install NeMo Domain
49
+ pip install "${{ steps.nemo-wheel.outputs.DIST_FILE }}[tts]"
50
+ # Run import checks
51
+ python tests/core_ptl/check_imports.py --domain "tts"
52
+ # Uninstall NeMo
53
+ pip uninstall -y nemo_toolkit
54
+
55
+ - name: Test NLP Domain Imports
56
+ run: |
57
+ # Install NeMo Domain
58
+ pip install "${{ steps.nemo-wheel.outputs.DIST_FILE }}[nlp]"
59
+ # Run import checks
60
+ python tests/core_ptl/check_imports.py --domain "nlp"
61
+ # Uninstall NeMo
62
+ pip uninstall -y nemo_toolkit
63
+
.github/workflows/labeler.yml ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: "Pull Request Labeler"
2
+ on:
3
+ - pull_request_target
4
+
5
+ jobs:
6
+ triage:
7
+ permissions:
8
+ contents: read
9
+ pull-requests: write
10
+ runs-on: ubuntu-latest
11
+ steps:
12
+ - uses: actions/labeler@v4
13
+ with:
14
+ repo-token: "${{ secrets.GITHUB_TOKEN }}"
.gitignore ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # log and data files
2
+ *.model
3
+ *.pkl
4
+ #*.ipynb
5
+ output
6
+ result
7
+ *.pt
8
+ tests/data/asr
9
+ .DS_Store
10
+ bert.pt.json
11
+ work
12
+ runs
13
+ fastspeech_output
14
+ .hydra
15
+ .bash_history.local
16
+
17
+ # Byte-compiled / optimized / DLL files
18
+ __pycache__/
19
+ *.py[cod]
20
+ *$py.class
21
+ **.pyc
22
+
23
+ # C extensions
24
+ *.so
25
+
26
+ # Distribution / packaging
27
+ .idea
28
+ .Python
29
+ wandb
30
+ build/
31
+ develop-eggs/
32
+ dist/
33
+ downloads/
34
+ eggs/
35
+ .eggs/
36
+ lib/
37
+ lib64/
38
+ #parts/
39
+ sdist/
40
+ var/
41
+ wheels/
42
+ pip-wheel-metadata/
43
+ share/python-wheels/
44
+ *.egg-info/
45
+ .installed.cfg
46
+ *.egg
47
+ MANIFEST
48
+
49
+ # PyInstaller
50
+ # Usually these files are written by a python script from a template
51
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
52
+ *.manifest
53
+ *.spec
54
+
55
+ # Installer logs
56
+ pip-log.txt
57
+ pip-delete-this-directory.txt
58
+
59
+ # Unit test / coverage reports
60
+ htmlcov/
61
+ .tox/
62
+ .nox/
63
+ .coverage
64
+ .coverage.*
65
+ .cache
66
+ nosetests.xml
67
+ coverage.xml
68
+ *.cover
69
+ .hypothesis/
70
+ .pytest_cache/
71
+
72
+ # Translations
73
+ *.mo
74
+ *.pot
75
+
76
+ # Django stuff:
77
+ *.log
78
+ local_settings.py
79
+ db.sqlite3
80
+
81
+ # Flask stuff:
82
+ instance/
83
+ .webassets-cache
84
+
85
+ # Scrapy stuff:
86
+ .scrapy
87
+
88
+ # Sphinx documentation
89
+ docs/build
90
+
91
+ # PyBuilder
92
+ target/
93
+
94
+ # Jupyter Notebook
95
+ .ipynb_checkpoints
96
+
97
+ # Override Jupyter in Github Language stats for more accurate estimate of repo code.
98
+ # Reference: https://github.com/github/linguist/blob/master/docs/overrides.md#generated-code
99
+ *.ipynb linguist-generated
100
+
101
+ # IPython
102
+ profile_default/
103
+ ipython_config.py
104
+
105
+ # pyenv
106
+ .python-version
107
+
108
+ # pipenv
109
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
110
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
111
+ # having no cross-platform support, pipenv may install dependencies that don’t work, or not
112
+ # install all needed dependencies.
113
+ #Pipfile.lock
114
+
115
+ # celery beat schedule file
116
+ celerybeat-schedule
117
+
118
+ # SageMath parsed files
119
+ *.sage.py
120
+
121
+ # Environments
122
+ .env
123
+ .venv
124
+ env/
125
+ venv/
126
+ ENV/
127
+ env.bak/
128
+ venv.bak/
129
+
130
+ # VSCode project settings
131
+ .vscode/
132
+
133
+ # Spyder project settings
134
+ .spyderproject
135
+ .spyproject
136
+
137
+ # Rope project settings
138
+ .ropeproject
139
+
140
+ # mkdocs documentation
141
+ /site
142
+ /docs/html
143
+ /docs/docs_zh/zh
144
+
145
+ # mypy
146
+ .mypy_cache/
147
+ .dmypy.json
148
+ dmypy.json
149
+
150
+ # Pyre type checker
151
+ .pyre/
152
+
153
+ # Emacs backup files
154
+ *~
155
+
156
+ cifar-10-batches-py
157
+ *.tar.gz
158
+
159
+ # Test data.
160
+ tests/.data
161
+ tests/data
162
+
163
+ # outputs folder
164
+ examples/*/outputs
165
+ examples/*/NeMo_experiments
166
+ examples/*/nemo_experiments
167
+ examples/*/.hydra
168
+ examples/*/wandb
169
+ examples/*/data
170
+ wandb
171
+ dump.py
172
+
173
+ docs/sources/source/test_build/
174
+
175
+ # Checkpoints, config files and temporary files created in tutorials.
176
+ examples/neural_graphs/*.chkpt
177
+ examples/neural_graphs/*.yml
178
+
179
+ .hydra/
180
+ nemo_experiments/
181
+
.pre-commit-config.yaml ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ default_language_version:
16
+ python: python3
17
+
18
+ ci:
19
+ autofix_prs: true
20
+ autoupdate_commit_msg: '[pre-commit.ci] pre-commit suggestions'
21
+ autoupdate_schedule: quarterly
22
+
23
+ repos:
24
+ - repo: https://github.com/pre-commit/pre-commit-hooks
25
+ rev: v4.3.0
26
+ hooks:
27
+ - id: check-yaml
28
+ - id: check-case-conflict
29
+ - id: detect-private-key
30
+ - id: check-added-large-files
31
+ args: ['--maxkb=1000']
32
+ - id: requirements-txt-fixer
33
+
34
+ - repo: https://github.com/PyCQA/isort
35
+ rev: 5.12.0
36
+ hooks:
37
+ - id: isort
38
+ name: Format imports
39
+ exclude: docs/
40
+
41
+ - repo: https://github.com/psf/black
42
+ rev: 19.10b0
43
+ hooks:
44
+ - id: black
45
+ name: Format code
46
+ args: [--skip-string-normalization, --line-length=119]
47
+ additional_dependencies: ['click==8.0.2']
.readthedocs.yml ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # =============================================================================
2
+ # Copyright (c) 2020 NVIDIA. All Rights Reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ # =============================================================================
16
+
17
+ # Read the Docs configuration file
18
+ # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
19
+
20
+ # Required field.
21
+ version: 2
22
+
23
+ # Build documentation in the docs/ directory with Sphinx.
24
+ sphinx:
25
+ configuration: docs/source/conf.py
26
+
27
+ # Set the version of Python and requirements required to build your docs
28
+ python:
29
+ version: 3.8
30
+ install:
31
+ - requirements: requirements/requirements_docs.txt
CITATION.cff ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ cff-version: 1.2.0
2
+ message: "If you use this software, please cite it as below."
3
+ title: "NeMo: a toolkit for Conversational AI and Large Language Models"
4
+ url: https://nvidia.github.io/NeMo/
5
+ repository-code: https://github.com/NVIDIA/NeMo
6
+ authors:
7
+ - family-names: Harper
8
+ given-names: Eric
9
+ - family-names: Majumdar
10
+ given-names: Somshubra
11
+ - family-names: Kuchaiev
12
+ given-names: Oleksii
13
+ - family-names: Li
14
+ given-names: Jason
15
+ - family-names: Zhang
16
+ given-names: Yang
17
+ - family-names: Bakhturina
18
+ given-names: Evelina
19
+ - family-names: Noroozi
20
+ given-names: Vahid
21
+ - family-names: Subramanian
22
+ given-names: Sandeep
23
+ - family-names: Koluguri
24
+ given-names: Nithin
25
+ - family-names: Huang
26
+ given-names: Jocelyn
27
+ - family-names: Jia
28
+ given-names: Fei
29
+ - family-names: Balam
30
+ given-names: Jagadeesh
31
+ - family-names: Yang
32
+ given-names: Xuesong
33
+ - family-names: Livne
34
+ given-names: Micha
35
+ - family-names: Dong
36
+ given-names: Yi
37
+ - family-names: Naren
38
+ given-names: Sean
39
+ - family-names: Ginsburg
40
+ given-names: Boris
41
+
CONTRIBUTING.md ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Contributions are welcome!
2
+
3
+ We do all of NeMo's development in the open. Contributions from the NeMo community are welcome.
4
+
5
+
6
+ # Pull Requests (PR) Guidelines
7
+
8
+ **Send your PRs to the `main` branch**
9
+
10
+ 1) Make sure your PR does one thing. Have a clear answer to "What does this PR do?".
11
+ 2) Read General Principles and style guide below
12
+ 3) Make sure you sign your commits. E.g. use ``git commit -s`` when you commit
13
+ 4) Make sure all unittests finish successfully before sending a PR: run ``pytest`` or (if your dev box does not have a GPU) ``pytest --cpu`` from NeMo's root folder
14
+ 5) Send your PR and request a review
15
+
16
+ ## Unit tests
17
+ Quick tests (locally, while developing)
18
+ ```
19
+ pytest
20
+ # If you don't have NVIDIA GPU do:
21
+ # pytest --cpu
22
+ ```
23
+ Full tests, including pre-trained model downloads
24
+ ```
25
+ pytest --with_downloads
26
+ ```
27
+
28
+ ## Whom should you ask for review:
29
+ 1. For changes to NeMo's core: @ericharper, @titu1994, @blisc, or @okuchaiev
30
+ 1. For changes to NeMo's ASR collection: @titu1994, @redoctopus, @jbalam-nv, or @okuchaiev
31
+ 1. For changes to NeMo's NLP collection: @MaximumEntropy, @ericharper, @ekmb, @yzhang123, @VahidooX, @vladgets, or @okuchaiev
32
+ 1. For changes to NeMo's TTS collection: @blisc, or @okuchaiev
33
+
34
+ Note that some people may self-assign to review your PR - in which case, please wait for them to add a review.
35
+
36
+ Your pull requests must pass all checks and peer-review before they can be merged.
37
+
38
+ # General principles
39
+ 1. **User-oriented**: make it easy for end users, even at the cost of writing more code in the background
40
+ 1. **Robust**: make it hard for users to make mistakes.
41
+ 1. **Well-tested**: please add simple, fast unittests. Consider adding CI tests for end-to-end functionality.
42
+ 1. **Reusable**: for every piece of code, think about how it can be reused in the future and make it easy to be reused.
43
+ 1. **Readable**: code should be easy to read.
44
+ 1. **Legal**: if you copy even one line of code from the Internet, make sure that the code's license is compatible with the license that NeMo uses. Give credit and link back to the code.
45
+ 1. **Sensible**: code should make sense. If you think a piece of code might be confusing, write comments.
46
+
47
+ ## Class naming conventions
48
+ * No “I”, “Interface”, “NM” nor “NeMo” pre/postfixes anywhere
49
+ * Core interfaces have simple names: Typing, Cloud, Serialization, FileIO*
50
+ * Core classes have the simplest names ever: NeuralModule, Model, Graph, Dataset, Loss, Module*
51
+ * Abstract classes in the Model hierarchy have Model postfix
52
+ * A config class for MyModel should be called MyModelConfig
53
+ * Leaf Neural Module classes have simple names without any postfixes (e.g. AudioPreprocess)
54
+ * Leaf Datasets have Dataset postfix (e.g. AudioToSpeechLabelDataset)
55
+ * Leaf Losses have Loss postfix (e.g. CTCLoss)
56
+ * Leaf Models do not have any postfix, just name (e.g. QuartzNet)
57
+
58
+ ## Python style
59
+ We use ``black`` as our style guide. To check whether your code will pass the style check, run (from NeMo's repo folder):
60
+ ``python setup.py style`` and if it does not pass run ``python setup.py style --fix``.
61
+
62
+ 1. Include docstrings for every class and method exposed to the user.
63
+ 1. Use Python 3 type hints for every class and method exposed to the user.
64
+ 1. Avoid wild import: ``from X import *`` unless in ``X.py``, ``__all__`` is defined.
65
+ 1. Minimize the use of ``**kwargs``.
66
+ 1. ``RaiseError`` is preferred to ``assert``. Write: ```if not X: raise Error``` instead of ```assert X```.
67
+ 1. Classes are preferred to standalone methods.
68
+ 1. Methods should be atomic. A method shouldn't be longer than 75 lines, i.e. it should fit on a computer screen without scrolling.
69
+ 1. If a method has arguments that don't fit into one line, each argument should be in its own line for readability.
70
+ 1. Add ``__init__.py`` for every folder.
71
+ 1. F-strings are preferred to formatted strings.
72
+ 1. Loggers are preferred to print. In NeMo, you can get the logger with ``from nemo.utils import logging``
73
+ 1. Private functions (functions whose names start with ``_``) shouldn't be called outside their host file.
74
+ 1. If a comment lasts multiple lines, use ``'''`` instead of ``#``.
75
+
76
+ # Collections
77
+ A collection is a logical grouping of related Neural Modules. It is a grouping of modules that share a domain area or semantics.
78
+ When contributing a module to a collection, please make sure it belongs to that category.
79
+ If you would like to start a new one and contribute back to the platform, you are very welcome to do so.
Dockerfile ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # syntax=docker/dockerfile:experimental
2
+
3
+ # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:23.02-py3
18
+
19
+ # build an image that includes only the nemo dependencies, ensures that dependencies
20
+ # are included first for optimal caching, and useful for building a development
21
+ # image (by specifying build target as `nemo-deps`)
22
+ FROM ${BASE_IMAGE} as nemo-deps
23
+
24
+ # dependency flags; should be declared after FROM
25
+ # torchaudio: not required by default
26
+ ARG REQUIRE_TORCHAUDIO=false
27
+ # k2: not required by default
28
+ ARG REQUIRE_K2=false
29
+ # ais cli: not required by default, install only if required
30
+ ARG REQUIRE_AIS_CLI=false
31
+
32
+ # Ensure apt-get won't prompt for selecting options
33
+ ENV DEBIAN_FRONTEND=noninteractive
34
+ # libavdevice-dev required for latest torchaudio
35
+ RUN apt-get update && \
36
+ apt-get upgrade -y && \
37
+ apt-get install -y \
38
+ libsndfile1 sox \
39
+ libfreetype6 \
40
+ swig \
41
+ ffmpeg \
42
+ libavdevice-dev && \
43
+ rm -rf /var/lib/apt/lists/*
44
+
45
+ WORKDIR /tmp/
46
+
47
+ # TODO: Remove once this Apex commit (2/24/23) is included in PyTorch
48
+ # container
49
+ RUN git clone https://github.com/NVIDIA/apex.git && \
50
+ cd apex && \
51
+ git checkout 03c9d80ed54c0eaa5b581bf42ceca3162f085327 && \
52
+ pip3 install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" --global-option="--fast_layer_norm" --global-option="--distributed_adam" --global-option="--deprecated_fused_adam" ./
53
+
54
+ # uninstall stuff from base container
55
+ RUN pip3 uninstall -y sacrebleu torchtext
56
+
57
+ # build torchaudio
58
+ WORKDIR /tmp/torchaudio_build
59
+ COPY scripts/installers /tmp/torchaudio_build/scripts/installers/
60
+ RUN INSTALL_MSG=$(/bin/bash /tmp/torchaudio_build/scripts/installers/install_torchaudio_latest.sh); INSTALL_CODE=$?; \
61
+ echo ${INSTALL_MSG}; \
62
+ if [ ${INSTALL_CODE} -ne 0 ]; then \
63
+ echo "torchaudio installation failed"; \
64
+ if [ "${REQUIRE_TORCHAUDIO}" = true ]; then \
65
+ exit ${INSTALL_CODE}; \
66
+ else echo "Skipping failed torchaudio installation"; fi \
67
+ else echo "torchaudio installed successfully"; fi
68
+
69
+ # install nemo dependencies
70
+ WORKDIR /tmp/nemo
71
+ COPY requirements .
72
+ RUN for f in $(ls requirements*.txt); do pip3 install --disable-pip-version-check --no-cache-dir -r $f; done
73
+
74
+ # install k2, skip if installation fails
75
+ COPY scripts /tmp/nemo/scripts/
76
+ RUN INSTALL_MSG=$(/bin/bash /tmp/nemo/scripts/speech_recognition/k2/setup.sh); INSTALL_CODE=$?; \
77
+ echo ${INSTALL_MSG}; \
78
+ if [ ${INSTALL_CODE} -ne 0 ]; then \
79
+ echo "k2 installation failed"; \
80
+ if [ "${REQUIRE_K2}" = true ]; then \
81
+ exit ${INSTALL_CODE}; \
82
+ else echo "Skipping failed k2 installation"; fi \
83
+ else echo "k2 installed successfully"; fi
84
+
85
+ # copy nemo source into a scratch image
86
+ FROM scratch as nemo-src
87
+ COPY . .
88
+
89
+ # start building the final container
90
+ FROM nemo-deps as nemo
91
+ ARG NEMO_VERSION=1.17.0
92
+
93
+ # Check that NEMO_VERSION is set. Build will fail without this. Expose NEMO and base container
94
+ # version information as runtime environment variable for introspection purposes
95
+ RUN /usr/bin/test -n "$NEMO_VERSION" && \
96
+ /bin/echo "export NEMO_VERSION=${NEMO_VERSION}" >> /root/.bashrc && \
97
+ /bin/echo "export BASE_IMAGE=${BASE_IMAGE}" >> /root/.bashrc
98
+
99
+ # Install NeMo
100
+ RUN --mount=from=nemo-src,target=/tmp/nemo cd /tmp/nemo && pip install ".[all]"
101
+
102
+ # Check install
103
+ RUN python -c "import nemo.collections.nlp as nemo_nlp" && \
104
+ python -c "import nemo.collections.tts as nemo_tts" && \
105
+ python -c "import nemo_text_processing.text_normalization as text_normalization"
106
+
107
+
108
+ # copy scripts/examples/tests into container for end user
109
+ WORKDIR /workspace/nemo
110
+ COPY scripts /workspace/nemo/scripts
111
+ COPY examples /workspace/nemo/examples
112
+ COPY tests /workspace/nemo/tests
113
+ COPY tutorials /workspace/nemo/tutorials
114
+ # COPY README.rst LICENSE /workspace/nemo/
115
+
116
+ RUN printf "#!/bin/bash\njupyter lab --no-browser --allow-root --ip=0.0.0.0" >> start-jupyter.sh && \
117
+ chmod +x start-jupyter.sh
118
+
119
+ # If required, install AIS CLI
120
+ RUN if [ "${REQUIRE_AIS_CLI}" = true ]; then \
121
+ INSTALL_MSG=$(/bin/bash scripts/installers/install_ais_cli_latest.sh); INSTALL_CODE=$?; \
122
+ echo ${INSTALL_MSG}; \
123
+ if [ ${INSTALL_CODE} -ne 0 ]; then \
124
+ echo "AIS CLI installation failed"; \
125
+ exit ${INSTALL_CODE}; \
126
+ else echo "AIS CLI installed successfully"; fi \
127
+ else echo "Skipping AIS CLI installation"; fi
Jenkinsfile ADDED
The diff for this file is too large to render. See raw diff
 
LICENSE ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
PUBLICATIONS.md ADDED
@@ -0,0 +1,213 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Publications
2
+
3
+ Here, we list a collection of research articles that utilize the NeMo Toolkit. If you would like to include your paper in this collection, please submit a PR updating this document.
4
+
5
+ -------
6
+
7
+ # Automatic Speech Recognition (ASR)
8
+
9
+ <details>
10
+ <summary>2023</summary>
11
+
12
+ * [Fast Entropy-Based Methods of Word-Level Confidence Estimation for End-to-End Automatic Speech Recognition](https://ieeexplore.ieee.org/abstract/document/10022960)
13
+ * [Damage Control During Domain Adaptation for Transducer Based Automatic Speech Recognition](https://ieeexplore.ieee.org/abstract/document/10023219)
14
+
15
+ </details>
16
+
17
+ <details>
18
+ <summary>2022</summary>
19
+
20
+ * [Multi-blank Transducers for Speech Recognition](https://arxiv.org/abs/2211.03541)
21
+
22
+ </details>
23
+
24
+ <details>
25
+ <summary>2021</summary>
26
+
27
+ * [Citrinet: Closing the Gap between Non-Autoregressive and Autoregressive End-to-End Models for Automatic Speech Recognition](https://arxiv.org/abs/2104.01721)
28
+ * [SPGISpeech: 5,000 hours of transcribed financial audio for fully formatted end-to-end speech recognition](https://www.isca-speech.org/archive/interspeech_2021/oneill21_interspeech.html)
29
+ * [CarneliNet: Neural Mixture Model for Automatic Speech Recognition](https://arxiv.org/abs/2107.10708)
30
+ * [CTC Variations Through New WFST Topologies](https://arxiv.org/abs/2110.03098)
31
+ * [A Toolbox for Construction and Analysis of Speech Datasets](https://openreview.net/pdf?id=oJ0oHQtAld)
32
+
33
+ </details>
34
+
35
+
36
+ <details>
37
+ <summary>2020</summary>
38
+
39
+ * [Cross-Language Transfer Learning, Continuous Learning, and Domain Adaptation for End-to-End Automatic Speech Recognition](https://ieeexplore.ieee.org/document/9428334)
40
+ * [Correction of Automatic Speech Recognition with Transformer Sequence-To-Sequence Model](https://ieeexplore.ieee.org/abstract/document/9053051)
41
+ * [Improving Noise Robustness of an End-to-End Neural Model for Automatic Speech Recognition](https://arxiv.org/abs/2010.12715)
42
+
43
+ </details>
44
+
45
+
46
+ <details>
47
+ <summary>2019</summary>
48
+
49
+ * [Jasper: An End-to-End Convolutional Neural Acoustic Model](https://arxiv.org/abs/1904.03288)
50
+ * [QuartzNet: Deep Automatic Speech Recognition with 1D Time-Channel Separable Convolutions](https://arxiv.org/abs/1910.10261)
51
+
52
+
53
+ </details>
54
+
55
+
56
+ --------
57
+
58
+
59
+ ## Speaker Recognition (SpkR)
60
+
61
+ <details>
62
+ <summary>2022</summary>
63
+
64
+ * [TitaNet: Neural Model for Speaker Representation with 1D Depth-Wise Separable Convolutions and Global Context](https://ieeexplore.ieee.org/abstract/document/9746806)
65
+
66
+ </details>
67
+
68
+
69
+ <details>
70
+ <summary>2020</summary>
71
+
72
+ * [SpeakerNet: 1D Depth-wise Separable Convolutional Network for Text-Independent Speaker Recognition and Verification]( https://arxiv.org/pdf/2010.12653.pdf)
73
+
74
+ </details>
75
+
76
+ --------
77
+
78
+ ## Speech Classification
79
+
80
+ <details>
81
+ <summary>2022</summary>
82
+
83
+ * [AmberNet: A Compact End-to-End Model for Spoken Language Identification](https://arxiv.org/abs/2210.15781)
84
+ * [Accidental Learners: Spoken Language Identification in Multilingual Self-Supervised Models](https://arxiv.org/abs/2211.05103)
85
+
86
+
87
+ </details>
88
+
89
+ <details>
90
+ <summary>2021</summary>
91
+
92
+ * [MarbleNet: Deep 1D Time-Channel Separable Convolutional Neural Network for Voice Activity Detection](https://ieeexplore.ieee.org/abstract/document/9414470/)
93
+
94
+ </details>
95
+
96
+
97
+ <details>
98
+ <summary>2020</summary>
99
+
100
+ * [MatchboxNet - 1D Time-Channel Separable Convolutional Neural Network Architecture for Speech Commands Recognition](http://www.interspeech2020.org/index.php?m=content&c=index&a=show&catid=337&id=993)
101
+
102
+ </details>
103
+
104
+
105
+ --------
106
+
107
+ ## Speech Translation
108
+
109
+ <details>
110
+ <summary>2022</summary>
111
+
112
+ * [NVIDIA NeMo Offline Speech Translation Systems for IWSLT 2022](https://aclanthology.org/2022.iwslt-1.18/)
113
+
114
+ </details>
115
+
116
+
117
+ --------
118
+
119
+ # Natural Language Processing (NLP)
120
+
121
+ ## Language Modeling
122
+
123
+ <details>
124
+ <summary>2022</summary>
125
+
126
+ * [Evaluating Parameter Efficient Learning for Generation](https://arxiv.org/abs/2210.13673)
127
+ * [Text Mining Drug/Chemical-Protein Interactions using an Ensemble of BERT and T5 Based Models](https://arxiv.org/abs/2111.15617)
128
+
129
+ </details>
130
+
131
+ <details>
132
+ <summary>2021</summary>
133
+
134
+ * [BioMegatron: Larger Biomedical Domain Language Model ](https://aclanthology.org/2020.emnlp-main.379/)
135
+
136
+ </details>
137
+
138
+ ## Neural Machine Translation
139
+
140
+ <details>
141
+ <summary>2022</summary>
142
+
143
+ * [Finding the Right Recipe for Low Resource Domain Adaptation in Neural Machine Translation](https://arxiv.org/abs/2206.01137)
144
+
145
+ </details>
146
+
147
+ <details>
148
+ <summary>2021</summary>
149
+
150
+ * [NVIDIA NeMo Neural Machine Translation Systems for English-German and English-Russian News and Biomedical Tasks at WMT21](https://arxiv.org/pdf/2111.08634.pdf)
151
+
152
+ </details>
153
+
154
+ --------
155
+
156
+ ## Dialogue State Tracking
157
+
158
+ <details>
159
+ <summary>2021</summary>
160
+
161
+ * [SGD-QA: Fast Schema-Guided Dialogue State Tracking for Unseen Services](https://arxiv.org/abs/2105.08049)
162
+
163
+ </details>
164
+
165
+ <details>
166
+ <summary>2020</summary>
167
+
168
+ * [A Fast and Robust BERT-based Dialogue State Tracker for Schema-Guided Dialogue Dataset](https://arxiv.org/abs/2008.12335)
169
+
170
+ </details>
171
+ --------
172
+
173
+
174
+ # Text To Speech (TTS)
175
+
176
+ <details>
177
+ <summary>2022</summary>
178
+
179
+ * [Adapter-Based Extension of Multi-Speaker Text-to-Speech Model for New Speakers](https://arxiv.org/abs/2211.00585)
180
+
181
+ </details>
182
+
183
+ <details>
184
+ <summary>2021</summary>
185
+
186
+ * [TalkNet: Fully-Convolutional Non-Autoregressive Speech Synthesis Model](https://www.isca-speech.org/archive/interspeech_2021/beliaev21_interspeech.html)
187
+ * [TalkNet 2: Non-Autoregressive Depth-Wise Separable Convolutional Model for Speech Synthesis with Explicit Pitch and Duration Prediction](https://arxiv.org/abs/2104.08189)
188
+ * [Hi-Fi Multi-Speaker English TTS Dataset](https://www.isca-speech.org/archive/pdfs/interspeech_2021/bakhturina21_interspeech.pdf)
189
+ * [Mixer-TTS: non-autoregressive, fast and compact text-to-speech model conditioned on language model embeddings](https://arxiv.org/abs/2110.03584)
190
+
191
+ </details>
192
+
193
+
194
+ --------
195
+
196
+ # (Inverse) Text Normalization
197
+ <details>
198
+ <summary>2022</summary>
199
+
200
+ * [Shallow Fusion of Weighted Finite-State Transducer and Language Model for Text Normalization](https://arxiv.org/abs/2203.15917)
201
+ * [Thutmose Tagger: Single-pass neural model for Inverse Text Normalization](https://arxiv.org/abs/2208.00064)
202
+
203
+ </details>
204
+
205
+ <details>
206
+ <summary>2021</summary>
207
+
208
+ * [NeMo Inverse Text Normalization: From Development to Production](https://www.isca-speech.org/archive/pdfs/interspeech_2021/zhang21ga_interspeech.pdf)
209
+ * [A Unified Transformer-based Framework for Duplex Text Normalization](https://arxiv.org/pdf/2108.09889.pdf )
210
+
211
+ </details>
212
+
213
+ --------
README.rst ADDED
@@ -0,0 +1,319 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ |status| |documentation| |codeql| |license| |pypi| |pyversion| |downloads| |black|
3
+
4
+ .. |status| image:: http://www.repostatus.org/badges/latest/active.svg
5
+ :target: http://www.repostatus.org/#active
6
+ :alt: Project Status: Active – The project has reached a stable, usable state and is being actively developed.
7
+
8
+ .. |documentation| image:: https://readthedocs.com/projects/nvidia-nemo/badge/?version=main
9
+ :alt: Documentation
10
+ :target: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/
11
+
12
+ .. |license| image:: https://img.shields.io/badge/License-Apache%202.0-brightgreen.svg
13
+ :target: https://github.com/NVIDIA/NeMo/blob/master/LICENSE
14
+ :alt: NeMo core license and license for collections in this repo
15
+
16
+ .. |pypi| image:: https://badge.fury.io/py/nemo-toolkit.svg
17
+ :target: https://badge.fury.io/py/nemo-toolkit
18
+ :alt: Release version
19
+
20
+ .. |pyversion| image:: https://img.shields.io/pypi/pyversions/nemo-toolkit.svg
21
+ :target: https://badge.fury.io/py/nemo-toolkit
22
+ :alt: Python version
23
+
24
+ .. |downloads| image:: https://static.pepy.tech/personalized-badge/nemo-toolkit?period=total&units=international_system&left_color=grey&right_color=brightgreen&left_text=downloads
25
+ :target: https://pepy.tech/project/nemo-toolkit
26
+ :alt: PyPi total downloads
27
+
28
+ .. |codeql| image:: https://github.com/nvidia/nemo/actions/workflows/codeql.yml/badge.svg?branch=main&event=push
29
+ :target: https://github.com/nvidia/nemo/actions/workflows/codeql.yml
30
+ :alt: CodeQL
31
+
32
+ .. |black| image:: https://img.shields.io/badge/code%20style-black-000000.svg
33
+ :target: https://github.com/psf/black
34
+ :alt: Code style: black
35
+
36
+ .. _main-readme:
37
+
38
+ **NVIDIA NeMo**
39
+ ===============
40
+
41
+ Introduction
42
+ ------------
43
+
44
+ NVIDIA NeMo is a conversational AI toolkit built for researchers working on automatic speech recognition (ASR),
45
+ text-to-speech synthesis (TTS), large language models (LLMs), and
46
+ natural language processing (NLP).
47
+ The primary objective of NeMo is to help researchers from industry and academia to reuse prior work (code and pretrained models)
48
+ and make it easier to create new `conversational AI models <https://developer.nvidia.com/conversational-ai#started>`_.
49
+
50
+ All NeMo models are trained with `Lightning <https://github.com/Lightning-AI/lightning>`_ and
51
+ training is automatically scalable to 1000s of GPUs.
52
+ Additionally, NeMo Megatron LLM models can be trained up to 1 trillion parameters using tensor and pipeline model parallelism.
53
+ NeMo models can be optimized for inference and deployed for production use-cases with `NVIDIA Riva <https://developer.nvidia.com/riva>`_.
54
+
55
+ Getting started with NeMo is simple.
56
+ State of the Art pretrained NeMo models are freely available on `HuggingFace Hub <https://huggingface.co/models?library=nemo&sort=downloads&search=nvidia>`_ and
57
+ `NVIDIA NGC <https://catalog.ngc.nvidia.com/models?query=nemo&orderBy=weightPopularDESC>`_.
58
+ These models can be used to transcribe audio, synthesize speech, or translate text in just a few lines of code.
59
+
60
+ We have extensive `tutorials <https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/starthere/tutorials.html>`_ that
61
+ can all be run on `Google Colab <https://colab.research.google.com>`_.
62
+
63
+ For advanced users that want to train NeMo models from scratch or finetune existing NeMo models
64
+ we have a full suite of `example scripts <https://github.com/NVIDIA/NeMo/tree/main/examples>`_ that support multi-GPU/multi-node training.
65
+
66
+ For scaling NeMo LLM training on Slurm clusters or public clouds, please see the `NVIDIA NeMo Megatron Launcher <https://github.com/NVIDIA/NeMo-Megatron-Launcher>`_.
67
+ The NM launcher has extensive recipes, scripts, utilities, and documentation for training NeMo LLMs and also has an `Autoconfigurator <https://github.com/NVIDIA/NeMo-Megatron-Launcher#53-using-autoconfigurator-to-find-the-optimal-configuration>`_
68
+ which can be used to find the optimal model parallel configuration for training on a specific cluster.
69
+
70
+ Also see our `introductory video <https://www.youtube.com/embed/wBgpMf_KQVw>`_ for a high level overview of NeMo.
71
+
72
+ Key Features
73
+ ------------
74
+
75
+ * Speech processing
76
+ * `HuggingFace Space for Audio Transcription (File, Microphone and YouTube) <https://huggingface.co/spaces/smajumdar/nemo_multilingual_language_id>`_
77
+ * `Automatic Speech Recognition (ASR) <https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/asr/intro.html>`_
78
+ * Supported models: Jasper, QuartzNet, CitriNet, Conformer-CTC, Conformer-Transducer, Squeezeformer-CTC, Squeezeformer-Transducer, ContextNet, LSTM-Transducer (RNNT), LSTM-CTC, FastConformer-CTC, FastConformer-Transducer...
79
+ * Supports CTC and Transducer/RNNT losses/decoders
80
+ * NeMo Original `Multi-blank Transducers <https://arxiv.org/abs/2211.03541>`_
81
+ * Beam Search decoding
82
+ * `Language Modelling for ASR <https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/asr/asr_language_modeling.html>`_: N-gram LM in fusion with Beam Search decoding, Neural Rescoring with Transformer
83
+ * Streaming and Buffered ASR (CTC/Transducer) - `Chunked Inference Examples <https://github.com/NVIDIA/NeMo/tree/stable/examples/asr/asr_chunked_inference>`_
84
+ * `Support of long audios for Conformer with memory efficient local attention <https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/asr/results.html#inference-on-long-audio>`_
85
+ * `Speech Classification, Speech Command Recognition and Language Identification <https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/asr/speech_classification/intro.html>`_: MatchboxNet (Command Recognition), AmberNet (LangID)
86
+ * `Voice Activity Detection (VAD) <https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/speech_classification/models.html#marblenet-vad>`_: MarbleNet
87
+ * ASR with VAD Inference - `Example <https://github.com/NVIDIA/NeMo/tree/stable/examples/asr/asr_vad>`_
88
+ * `Speaker Recognition <https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/asr/speaker_recognition/intro.html>`_: TitaNet, ECAPA_TDNN, SpeakerNet
89
+ * `Speaker Diarization <https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/asr/speaker_diarization/intro.html>`_
90
+ * Clustering Diarizer: TitaNet, ECAPA_TDNN, SpeakerNet
91
+ * Neural Diarizer: MSDD (Multi-scale Diarization Decoder)
92
+ * `Speech Intent Detection and Slot Filling <https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/asr/speech_intent_slot/intro.html>`_: Conformer-Transformer
93
+ * `Pretrained models on different languages. <https://ngc.nvidia.com/catalog/collections/nvidia:nemo_asr>`_: English, Spanish, German, Russian, Chinese, French, Italian, Polish, ...
94
+ * `NGC collection of pre-trained speech processing models. <https://ngc.nvidia.com/catalog/collections/nvidia:nemo_asr>`_
95
+ * Natural Language Processing
96
+ * `NeMo Megatron pre-training of Large Language Models <https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/nlp/nemo_megatron/intro.html>`_
97
+ * `Neural Machine Translation (NMT) <https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/nlp/machine_translation/machine_translation.html>`_
98
+ * `Punctuation and Capitalization <https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/nlp/punctuation_and_capitalization.html>`_
99
+ * `Token classification (named entity recognition) <https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/nlp/token_classification.html>`_
100
+ * `Text classification <https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/nlp/text_classification.html>`_
101
+ * `Joint Intent and Slot Classification <https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/nlp/joint_intent_slot.html>`_
102
+ * `Question answering <https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/nlp/question_answering.html>`_
103
+ * `GLUE benchmark <https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/nlp/glue_benchmark.html>`_
104
+ * `Information retrieval <https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/nlp/information_retrieval.html>`_
105
+ * `Entity Linking <https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/nlp/entity_linking.html>`_
106
+ * `Dialogue State Tracking <https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/nlp/sgd_qa.html>`_
107
+ * `Prompt Learning <https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/nlp/nemo_megatron/prompt_learning.html>`_
108
+ * `NGC collection of pre-trained NLP models. <https://ngc.nvidia.com/catalog/collections/nvidia:nemo_nlp>`_
109
+ * `Synthetic Tabular Data Generation <https://developer.nvidia.com/blog/generating-synthetic-data-with-transformers-a-solution-for-enterprise-data-challenges/>`_
110
+ * `Speech synthesis (TTS) <https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/tts/intro.html#>`_
111
+ * Spectrogram generation: Tacotron2, GlowTTS, TalkNet, FastPitch, FastSpeech2, Mixer-TTS, Mixer-TTS-X
112
+ * Vocoders: WaveGlow, SqueezeWave, UniGlow, MelGAN, HiFiGAN, UnivNet
113
+ * End-to-end speech generation: FastPitch_HifiGan_E2E, FastSpeech2_HifiGan_E2E, VITS
114
+ * `NGC collection of pre-trained TTS models. <https://ngc.nvidia.com/catalog/collections/nvidia:nemo_tts>`_
115
+ * `Tools <https://github.com/NVIDIA/NeMo/tree/stable/tools>`_
116
+ * `Text Processing (text normalization and inverse text normalization) <https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/nlp/text_normalization/intro.html>`_
117
+ * `CTC-Segmentation tool <https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/tools/ctc_segmentation.html>`_
118
+ * `Speech Data Explorer <https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/tools/speech_data_explorer.html>`_: a dash-based tool for interactive exploration of ASR/TTS datasets
119
+
120
+
121
+ Built for speed, NeMo can utilize NVIDIA's Tensor Cores and scale out training to multiple GPUs and multiple nodes.
122
+
123
+ Requirements
124
+ ------------
125
+
126
+ 1) Python 3.8 or above
127
+ 2) PyTorch 1.10.0 or above
128
+ 3) NVIDIA GPU for training
129
+
130
+ Documentation
131
+ -------------
132
+
133
+ .. |main| image:: https://readthedocs.com/projects/nvidia-nemo/badge/?version=main
134
+ :alt: Documentation Status
135
+ :scale: 100%
136
+ :target: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/
137
+
138
+ .. |stable| image:: https://readthedocs.com/projects/nvidia-nemo/badge/?version=stable
139
+ :alt: Documentation Status
140
+ :scale: 100%
141
+ :target: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/
142
+
143
+ +---------+-------------+------------------------------------------------------------------------------------------------------------------------------------------+
144
+ | Version | Status | Description |
145
+ +=========+=============+==========================================================================================================================================+
146
+ | Latest | |main| | `Documentation of the latest (i.e. main) branch. <https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/>`_ |
147
+ +---------+-------------+------------------------------------------------------------------------------------------------------------------------------------------+
148
+ | Stable | |stable| | `Documentation of the stable (i.e. most recent release) branch. <https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/>`_ |
149
+ +---------+-------------+------------------------------------------------------------------------------------------------------------------------------------------+
150
+
151
+ Tutorials
152
+ ---------
153
+ A great way to start with NeMo is by checking `one of our tutorials <https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/starthere/tutorials.html>`_.
154
+
155
+ Getting help with NeMo
156
+ ----------------------
157
+ FAQ can be found on NeMo's `Discussions board <https://github.com/NVIDIA/NeMo/discussions>`_. You are welcome to ask questions or start discussions there.
158
+
159
+
160
+ Installation
161
+ ------------
162
+
163
+ Conda
164
+ ~~~~~
165
+
166
+ We recommend installing NeMo in a fresh Conda environment.
167
+
168
+ .. code-block:: bash
169
+
170
+ conda create --name nemo python==3.8.10
171
+ conda activate nemo
172
+
173
+ Install PyTorch using their `configurator <https://pytorch.org/get-started/locally/>`_.
174
+
175
+ .. code-block:: bash
176
+
177
+ conda install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia
178
+
179
+ The command used to install PyTorch may depend on your system. Please use the configurator linked above to find the right command for your system.
180
+
181
+ Pip
182
+ ~~~
183
+ Use this installation mode if you want the latest released version.
184
+
185
+ .. code-block:: bash
186
+
187
+ apt-get update && apt-get install -y libsndfile1 ffmpeg
188
+ pip install Cython
189
+ pip install nemo_toolkit['all']
190
+
191
+ Depending on the shell used, you may need to use ``"nemo_toolkit[all]"`` instead in the above command.
192
+
193
+ Pip from source
194
+ ~~~~~~~~~~~~~~~
195
+ Use this installation mode if you want the version from a particular GitHub branch (e.g. main).
196
+
197
+ .. code-block:: bash
198
+
199
+ apt-get update && apt-get install -y libsndfile1 ffmpeg
200
+ pip install Cython
201
+ python -m pip install git+https://github.com/NVIDIA/NeMo.git@{BRANCH}#egg=nemo_toolkit[all]
202
+
203
+
204
+ From source
205
+ ~~~~~~~~~~~
206
+ Use this installation mode if you are contributing to NeMo.
207
+
208
+ .. code-block:: bash
209
+
210
+ apt-get update && apt-get install -y libsndfile1 ffmpeg
211
+ git clone https://github.com/NVIDIA/NeMo
212
+ cd NeMo
213
+ ./reinstall.sh
214
+
215
+ If you only want the toolkit without additional conda-based dependencies, you may replace ``reinstall.sh``
216
+ with ``pip install -e .`` when your PWD is the root of the NeMo repository.
217
+
218
+ RNNT
219
+ ~~~~
220
+ Note that RNNT requires numba to be installed from conda.
221
+
222
+ .. code-block:: bash
223
+
224
+ conda remove numba
225
+ pip uninstall numba
226
+ conda install -c conda-forge numba
227
+
228
+ NeMo Megatron
229
+ ~~~~~~~~~~~~~
230
+ NeMo Megatron training requires NVIDIA Apex to be installed.
231
+ Install it manually if not using the NVIDIA PyTorch container.
232
+
233
+ .. code-block:: bash
234
+
235
+ git clone https://github.com/NVIDIA/apex.git
236
+ cd apex
237
+ git checkout 03c9d80ed54c0eaa5b581bf42ceca3162f085327
238
+ pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" --global-option="--fast_layer_norm" --global-option="--distributed_adam" --global-option="--deprecated_fused_adam" ./
239
+
240
+ It is highly recommended to use the NVIDIA PyTorch or NeMo container if having issues installing Apex or any other dependencies.
241
+
242
+ While installing Apex, it may raise an error if the CUDA version on your system does not match the CUDA version torch was compiled with.
243
+ This error can be avoided by commenting out the check here: https://github.com/NVIDIA/apex/blob/master/setup.py#L32
244
+
245
+ cuda-nvprof is needed to install Apex. The version should match the CUDA version that you are using:
246
+
247
+ .. code-block:: bash
248
+
249
+ conda install -c nvidia cuda-nvprof=11.8
250
+
251
+ packaging is also needed:
252
+
253
+ .. code-block:: bash
254
+
255
+ pip install packaging
256
+
257
+
258
+ Transformer Engine
259
+ ~~~~~~~~~~~~~~~~~~
260
+ NeMo Megatron GPT has been integrated with `NVIDIA Transformer Engine <https://github.com/NVIDIA/TransformerEngine>`_.
261
+ Transformer Engine enables FP8 training on NVIDIA Hopper GPUs.
262
+ `Install <https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/installation.html>`_ it manually if not using the NVIDIA PyTorch container.
263
+
264
+ .. code-block:: bash
265
+
266
+ pip install --upgrade git+https://github.com/NVIDIA/TransformerEngine.git@stable
267
+
268
+ It is highly recommended to use the NVIDIA PyTorch or NeMo container if having issues installing Transformer Engine or any other dependencies.
269
+
270
+ Transformer Engine requires PyTorch to be built with CUDA 11.8.
271
+
272
+ NeMo Text Processing
273
+ ~~~~~~~~~~~~~~~~~~~~
274
+ NeMo Text Processing, specifically (Inverse) Text Normalization, is now a separate repository `https://github.com/NVIDIA/NeMo-text-processing <https://github.com/NVIDIA/NeMo-text-processing>`_.
275
+
276
+ Docker containers:
277
+ ~~~~~~~~~~~~~~~~~~
278
+ We release NeMo containers alongside NeMo releases. For example, NeMo ``r1.16.0`` comes with container ``nemo:23.01``. You can find more details about released containers on the `releases page <https://github.com/NVIDIA/NeMo/releases>`_.
279
+
280
+ To use a pre-built container, please run
281
+
282
+ .. code-block:: bash
283
+
284
+ docker pull nvcr.io/nvidia/nemo:23.01
285
+
286
+ To build a nemo container with Dockerfile from a branch, please run
287
+
288
+ .. code-block:: bash
289
+
290
+ DOCKER_BUILDKIT=1 docker build -f Dockerfile -t nemo:latest .
291
+
292
+
293
+ If you choose to work with the main branch, we recommend using NVIDIA's PyTorch container version 23.02-py3 and then installing from GitHub.
294
+
295
+ .. code-block:: bash
296
+
297
+ docker run --gpus all -it --rm -v <nemo_github_folder>:/NeMo --shm-size=8g \
298
+ -p 8888:8888 -p 6006:6006 --ulimit memlock=-1 --ulimit \
299
+ stack=67108864 --device=/dev/snd nvcr.io/nvidia/pytorch:23.02-py3
300
+
301
+ Examples
302
+ --------
303
+
304
+ Many examples can be found under the `"Examples" <https://github.com/NVIDIA/NeMo/tree/stable/examples>`_ folder.
305
+
306
+
307
+ Contributing
308
+ ------------
309
+
310
+ We welcome community contributions! Please refer to `CONTRIBUTING.md <https://github.com/NVIDIA/NeMo/blob/stable/CONTRIBUTING.md>`_ for the process.
311
+
312
+ Publications
313
+ ------------
314
+
315
+ We provide an ever-growing list of publications that utilize the NeMo framework. Please refer to `PUBLICATIONS.md <https://github.com/NVIDIA/NeMo/tree/stable/PUBLICATIONS.md>`_. We welcome the addition of your own articles to this list!
316
+
317
+ License
318
+ -------
319
+ NeMo is under `Apache 2.0 license <https://github.com/NVIDIA/NeMo/blob/stable/LICENSE>`_.
ci.groovy ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ @Library('blossom-github-lib@master')
2
+ import ipp.blossom.*
3
+
4
+ podTemplate(cloud:'sc-ipp-blossom-prod', yaml : """
5
+ apiVersion: v1
6
+ kind: Pod
7
+ metadata:
8
+ labels:
9
+ some-label: some-label-value
10
+ spec:
11
+ volumes:
12
+ - name: scratch
13
+ nfs:
14
+ server: ipp1-cdot01-col01
15
+ path: /vol/scratch1/scratch.okuchaiev_blossom
16
+ containers:
17
+ - name: latestdlfw
18
+ image: nvcr.io/nvidia/pytorch:23.02-py3
19
+ command:
20
+ - cat
21
+ volumeMounts:
22
+ - name: scratch
23
+ mountPath: /testdata
24
+ resources:
25
+ limits:
26
+ nvidia.com/gpu: 2
27
+ restartPolicy: Never
28
+ backoffLimit: 4
29
+ tty: true
30
+ shm-size: 32g
31
+ nodeSelector:
32
+ kubernetes.io/os: linux
33
+ nvidia.com/gpu_type: "Tesla_T4x4"
34
+ nvidia.com/node_type: gpu_tester
35
+ nvidia.com/driver_version: "510.20"
36
+ """
37
+ ) {
38
+ node(POD_LABEL) {
39
+ def githubHelper
40
+ stage('Get Token') {
41
+ withCredentials([usernamePassword(credentialsId: 'GHAtoken', passwordVariable: 'GIT_PASSWORD', usernameVariable: 'GIT_USERNAME')]) {
42
+ // create new instance of helper object
43
+ githubHelper = GithubHelper.getInstance("${GIT_PASSWORD}", githubData)
44
+ }
45
+
46
+ }
47
+ def stageName = ''
48
+ try {
49
+ currentBuild.description = githubHelper.getBuildDescription()
50
+ container('latestdlfw') {
51
+ stage('Code checkout') {
52
+ // update status on github
53
+ githubHelper.updateCommitStatus("$BUILD_URL", "$stageName Running", GitHubCommitState.PENDING)
54
+ checkout changelog: true, poll: true, scm: [$class: 'GitSCM', branches: [[name: "pr/"+githubHelper.getPRNumber()]],
55
+ doGenerateSubmoduleConfigurations: false,
56
+ submoduleCfg: [],
57
+ userRemoteConfigs: [[credentialsId: 'github-token', url: githubHelper.getCloneUrl(), refspec: '+refs/pull/*/head:refs/remotes/origin/pr/*']]]
58
+ }
59
+
60
+ stage('Code Style') {
61
+ sh "apt-get update && \
62
+ apt-get install -y bc && \
63
+ nvidia-smi && \
64
+ pip install -r requirements/requirements_test.txt && \
65
+ python setup.py style && ls -l /testdata/TestData && ln -s /testdata/TestData /home/TestData && \
66
+ ls -l /home && ls -l /home/TestData"
67
+ }
68
+
69
+ stage('Installation') {
70
+ sh "git config --global --add safe.directory '*' && nvidia-smi && ./reinstall.sh release"
71
+ }
72
+
73
+ stage('L0: GPU unit tests') {
74
+ sh "NEMO_NUMBA_MINVER=0.53 pytest -m 'not pleasefixme'"
75
+ }
76
+
77
+ parallel( //USE CUDA_VISIBLE_DEVICES to execute 2 single GPU tests in parallel here
78
+ [
79
+ "L1: NMT Training Pre-LN": { sh 'CUDA_VISIBLE_DEVICES=0 python examples/nlp/machine_translation/enc_dec_nmt.py \
80
+ --config-path=conf \
81
+ --config-name=aayn_base \
82
+ do_testing=true \
83
+ model.train_ds.src_file_name=/testdata/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
84
+ model.train_ds.tgt_file_name=/testdata/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \
85
+ model.validation_ds.src_file_name=/testdata/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
86
+ model.validation_ds.tgt_file_name=/testdata/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
87
+ model.test_ds.src_file_name=/testdata/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
88
+ model.test_ds.tgt_file_name=/testdata/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
89
+ model.encoder_tokenizer.tokenizer_model=/testdata/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \
90
+ model.decoder_tokenizer.tokenizer_model=/testdata/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \
91
+ model.encoder.pre_ln=true \
92
+ model.decoder.pre_ln=true \
93
+ trainer.devices=[0] \
94
+ trainer.accelerator="gpu" \
95
+ +trainer.fast_dev_run=true \
96
+ +trainer.limit_test_batches=2 \
97
+ exp_manager=null \
98
+ '},
99
+ "L1: Speech to text": { sh 'CUDA_VISIBLE_DEVICES=1 python examples/asr/asr_ctc/speech_to_text_ctc.py \
100
+ model.train_ds.manifest_filepath=/testdata/TestData/an4_dataset/an4_train.json \
101
+ model.validation_ds.manifest_filepath=/testdata/TestData/an4_dataset/an4_val.json \
102
+ trainer.devices=[0] \
103
+ trainer.accelerator="gpu" \
104
+ +trainer.fast_dev_run=True \
105
+ exp_manager=null \
106
+ '}
107
+ ]
108
+ )//end of parallel
109
+ }
110
+ githubHelper.updateCommitStatus("$BUILD_URL", "Complete", GitHubCommitState.SUCCESS)
111
+ }
112
+ catch (Exception ex){
113
+ currentBuild.result = 'FAILURE'
114
+ println ex
115
+ githubHelper.updateCommitStatus("$BUILD_URL", "$stageName Failed", GitHubCommitState.FAILURE)
116
+ }
117
+
118
+ }
119
+ }
docs/.nojekyll ADDED
File without changes
docs/Makefile ADDED
@@ -0,0 +1,216 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Makefile for Sphinx documentation
2
+ #
3
+
4
+ # You can set these variables from the command line.
5
+ SPHINXOPTS =
6
+ SPHINXBUILD = sphinx-build
7
+ PAPER =
8
+ BUILDDIR = build
9
+
10
+ # User-friendly check for sphinx-build
11
+ ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)
12
+ $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/)
13
+ endif
14
+
15
+ # Internal variables.
16
+ PAPEROPT_a4 = -D latex_paper_size=a4
17
+ PAPEROPT_letter = -D latex_paper_size=letter
18
+ ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source
19
+ # the i18n builder cannot share the environment and doctrees with the others
20
+ I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source
21
+
22
+ .PHONY: help
23
+ help:
24
+ @echo "Please use \`make <target>' where <target> is one of"
25
+ @echo " html to make standalone HTML files"
26
+ @echo " dirhtml to make HTML files named index.html in directories"
27
+ @echo " singlehtml to make a single large HTML file"
28
+ @echo " pickle to make pickle files"
29
+ @echo " json to make JSON files"
30
+ @echo " htmlhelp to make HTML files and a HTML help project"
31
+ @echo " qthelp to make HTML files and a qthelp project"
32
+ @echo " applehelp to make an Apple Help Book"
33
+ @echo " devhelp to make HTML files and a Devhelp project"
34
+ @echo " epub to make an epub"
35
+ @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
36
+ @echo " latexpdf to make LaTeX files and run them through pdflatex"
37
+ @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
38
+ @echo " text to make text files"
39
+ @echo " man to make manual pages"
40
+ @echo " texinfo to make Texinfo files"
41
+ @echo " info to make Texinfo files and run them through makeinfo"
42
+ @echo " gettext to make PO message catalogs"
43
+ @echo " changes to make an overview of all changed/added/deprecated items"
44
+ @echo " xml to make Docutils-native XML files"
45
+ @echo " pseudoxml to make pseudoxml-XML files for display purposes"
46
+ @echo " linkcheck to check all external links for integrity"
47
+ @echo " doctest to run all doctests embedded in the documentation (if enabled)"
48
+ @echo " coverage to run coverage check of the documentation (if enabled)"
49
+
50
+ .PHONY: clean
51
+ clean:
52
+ rm -rf $(BUILDDIR)/*
53
+
54
+ .PHONY: html
55
+ html:
56
+ $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
57
+ @echo
58
+ @echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
59
+
60
+ .PHONY: dirhtml
61
+ dirhtml:
62
+ $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
63
+ @echo
64
+ @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
65
+
66
+ .PHONY: singlehtml
67
+ singlehtml:
68
+ $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
69
+ @echo
70
+ @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
71
+
72
+ .PHONY: pickle
73
+ pickle:
74
+ $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
75
+ @echo
76
+ @echo "Build finished; now you can process the pickle files."
77
+
78
+ .PHONY: json
79
+ json:
80
+ $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
81
+ @echo
82
+ @echo "Build finished; now you can process the JSON files."
83
+
84
+ .PHONY: htmlhelp
85
+ htmlhelp:
86
+ $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
87
+ @echo
88
+ @echo "Build finished; now you can run HTML Help Workshop with the" \
89
+ ".hhp project file in $(BUILDDIR)/htmlhelp."
90
+
91
+ .PHONY: qthelp
92
+ qthelp:
93
+ $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
94
+ @echo
95
+ @echo "Build finished; now you can run "qcollectiongenerator" with the" \
96
+ ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
97
+ @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/OpenSeq2Seq.qhcp"
98
+ @echo "To view the help file:"
99
+ @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/OpenSeq2Seq.qhc"
100
+
101
+ .PHONY: applehelp
102
+ applehelp:
103
+ $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp
104
+ @echo
105
+ @echo "Build finished. The help book is in $(BUILDDIR)/applehelp."
106
+ @echo "N.B. You won't be able to view it unless you put it in" \
107
+ "~/Library/Documentation/Help or install it in your application" \
108
+ "bundle."
109
+
110
+ .PHONY: devhelp
111
+ devhelp:
112
+ $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
113
+ @echo
114
+ @echo "Build finished."
115
+ @echo "To view the help file:"
116
+ @echo "# mkdir -p $$HOME/.local/share/devhelp/OpenSeq2Seq"
117
+ @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/OpenSeq2Seq"
118
+ @echo "# devhelp"
119
+
120
+ .PHONY: epub
121
+ epub:
122
+ $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
123
+ @echo
124
+ @echo "Build finished. The epub file is in $(BUILDDIR)/epub."
125
+
126
+ .PHONY: latex
127
+ latex:
128
+ $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
129
+ @echo
130
+ @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
131
+ @echo "Run \`make' in that directory to run these through (pdf)latex" \
132
+ "(use \`make latexpdf' here to do that automatically)."
133
+
134
+ .PHONY: latexpdf
135
+ latexpdf:
136
+ $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
137
+ @echo "Running LaTeX files through pdflatex..."
138
+ $(MAKE) -C $(BUILDDIR)/latex all-pdf
139
+ @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
140
+
141
+ .PHONY: latexpdfja
142
+ latexpdfja:
143
+ $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
144
+ @echo "Running LaTeX files through platex and dvipdfmx..."
145
+ $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
146
+ @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
147
+
148
+ .PHONY: text
149
+ text:
150
+ $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
151
+ @echo
152
+ @echo "Build finished. The text files are in $(BUILDDIR)/text."
153
+
154
+ .PHONY: man
155
+ man:
156
+ $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
157
+ @echo
158
+ @echo "Build finished. The manual pages are in $(BUILDDIR)/man."
159
+
160
+ .PHONY: texinfo
161
+ texinfo:
162
+ $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
163
+ @echo
164
+ @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
165
+ @echo "Run \`make' in that directory to run these through makeinfo" \
166
+ "(use \`make info' here to do that automatically)."
167
+
168
+ .PHONY: info
169
+ info:
170
+ $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
171
+ @echo "Running Texinfo files through makeinfo..."
172
+ make -C $(BUILDDIR)/texinfo info
173
+ @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
174
+
175
+ .PHONY: gettext
176
+ gettext:
177
+ $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
178
+ @echo
179
+ @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
180
+
181
+ .PHONY: changes
182
+ changes:
183
+ $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
184
+ @echo
185
+ @echo "The overview file is in $(BUILDDIR)/changes."
186
+
187
+ .PHONY: linkcheck
188
+ linkcheck:
189
+ $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
190
+ @echo
191
+ @echo "Link check complete; look for any errors in the above output " \
192
+ "or in $(BUILDDIR)/linkcheck/output.txt."
193
+
194
+ .PHONY: doctest
195
+ doctest:
196
+ $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
197
+ @echo "Testing of doctests in the sources finished, look at the " \
198
+ "results in $(BUILDDIR)/doctest/output.txt."
199
+
200
+ .PHONY: coverage
201
+ coverage:
202
+ $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage
203
+ @echo "Testing of coverage in the sources finished, look at the " \
204
+ "results in $(BUILDDIR)/coverage/python.txt."
205
+
206
+ .PHONY: xml
207
+ xml:
208
+ $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml
209
+ @echo
210
+ @echo "Build finished. The XML files are in $(BUILDDIR)/xml."
211
+
212
+ .PHONY: pseudoxml
213
+ pseudoxml:
214
+ $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
215
+ @echo
216
+ @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."
docs/source/_static/css/custom.css ADDED
@@ -0,0 +1,298 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* Import the Roboto font (regular weight, 400) */
2
+ @import url('https://fonts.googleapis.com/css2?family=Roboto:wght@400&display=swap');
3
+
4
+ body {
5
+ font-size: 100%;
6
+ font-family: 'Roboto', sans-serif;
7
+ }
8
+
9
+
10
+ /* Width of template */
11
+
12
+ .wy-nav-content {
13
+ max-width: 1200px !important;
14
+ }
15
+
16
+
17
+
18
+ /* Standard Text Formatting */
19
+
20
+ h1 {
21
+ color: #76b900;
22
+ text-align: center;
23
+ /* background-color: #ffffff; */
24
+ }
25
+
26
+ h2 {
27
+ color: #ffffff;
28
+ /* background-color: #ffffff; */
29
+ /* #76b900 */
30
+ Padding: 5px;
31
+ }
32
+
33
+ h3 {
34
+ padding-top: 0px;
35
+ border-top: solid 3px #000000;
36
+ /* #76b900 */
37
+ border-bottom: solid 3px #000000;
38
+ /* #76b900 */
39
+ }
40
+
41
+ p {
42
+ margin-bottom: 24px;
43
+ }
44
+
45
+ /* Link Colors */
46
+ a {
47
+ color: #76b900;
48
+ }
49
+
50
+ a:visited {
51
+ color: #218219;
52
+ }
53
+
54
+ .container-xl {
55
+ margin-right: unset;
56
+ margin-left: unset;
57
+ }
58
+
59
+ section {
60
+ overflow-x: auto;
61
+ }
62
+
63
+ /* ----------------------------------------------TABLES--------------------------------------- */
64
+ section table {
65
+ overflow-x: auto;
66
+ display: block;
67
+ }
68
+
69
+ table {
70
+ font-size: small;
71
+ }
72
+
73
+ /* Table head Color */
74
+ thead td {
75
+ background-color: #333333 !important;
76
+ }
77
+
78
+ .row-odd p {
79
+ /*padding-bottom: 0px;*/
80
+ /*margin-bottom: 0px;*/
81
+ }
82
+
83
+ /* even rows*/
84
+
85
+ .row-even tr {
86
+ background-color: #e5f1e6 !important;
87
+ }
88
+
89
+ /* odd rows*/
90
+
91
+
92
+ .wy-table-responsive table tr {
93
+ background-color: #ffffff !important;
94
+ }
95
+
96
+
97
+
98
+ .wy-table-responsive table td {
99
+ white-space: normal;
100
+ }
101
+
102
+
103
+ /* Removes bottom margin in tables*/
104
+
105
+ .rst-content .line-block {
106
+ margin-bottom: 0px;
107
+ }
108
+
109
+ .wy-table-responsive {
110
+ overflow: visible !important;
111
+ }
112
+
113
+ /* reduces the size of text in multiline table columns. */
114
+
115
+ .rst-content table.docutils td {
116
+ font-size: 80%;
117
+ }
118
+
119
+ .rst-content dl:not(.docutils) dt {
120
+
121
+ background-color: inherit;
122
+ color: #000000;
123
+ border-top: solid 0px #000000;
124
+
125
+ }
126
+
127
+ .rst-content dl:not(.docutils) dt:before {
128
+ color: #333333;
129
+ }
130
+
131
+ .rst-content .line-block {
132
+ margin-bottom: 0px;
133
+ }
134
+
135
+ .wy-side-nav-search,
136
+ .wy-nav-top {
137
+ background-color: #000000;
138
+ padding: 0;
139
+ }
140
+
141
+ .wy-side-nav-search img {
142
+ padding: 0px;
143
+ padding: 0px 0px;
144
+ margin-bottom: 0;
145
+ }
146
+
147
+ .wy-side-nav-search input[type=text] {
148
+ border-radius: 0px;
149
+ }
150
+
151
+
152
+ .wy-menu-vertical p.caption {
153
+ color: #76b900;
154
+ }
155
+
156
+
157
+ .wy-side-nav-search>a img.logo,
158
+ .wy-side-nav-search .wy-dropdown>a img.logo {
159
+ margin: 0px 0px 0px 0px;
160
+ }
161
+
162
+ .wy-nav-content {
163
+ margin: 0;
164
+ min-height: 100%;
165
+ height: 100%;
166
+ background: #ffffff;
167
+ }
168
+
169
+ /* List (numbered, bulleted) padding Fix */
170
+
171
+
172
+ .wy-plain-list-decimal li {
173
+ margin-top: -6px;
174
+ margin-bottom: -6px;
175
+ }
176
+
177
+ .rst-content .section ol.loweralpha {
178
+ margin-top: -6px;
179
+ margin-bottom: 12px;
180
+ }
181
+
182
+ .wy-plain-list-disc,
183
+ .rst-content .toctree-wrapper ul,
184
+ article ul {
185
+ margin-top: 0px !important;
186
+ margin-bottom: 12px;
187
+ }
188
+
189
+ /* Alert Boxes */
190
+ /* Background color of Alert Box Title */
191
+
192
+ .rst-content .section ul {
193
+ margin-top: -12px;
194
+ margin-bottom: 16px;
195
+ }
196
+
197
+ .wy-alert.wy-alert-info .wy-alert-title,
198
+ .rst-content .note .wy-alert-title,
199
+ .rst-content .wy-alert-info.attention .wy-alert-title,
200
+ .rst-content .wy-alert-info.caution .wy-alert-title,
201
+ .rst-content .wy-alert-info.danger .wy-alert-title,
202
+ .rst-content .wy-alert-info.error .wy-alert-title,
203
+ .rst-content .wy-alert-info.hint .wy-alert-title,
204
+ .rst-content .wy-alert-info.important .wy-alert-title,
205
+ .rst-content .wy-alert-info.tip .wy-alert-title,
206
+ .rst-content .wy-alert-info.warning .wy-alert-title,
207
+ .rst-content .seealso .wy-alert-title,
208
+ .rst-content .wy-alert-info.admonition-todo .wy-alert-title,
209
+ .rst-content .wy-alert-info.admonition .wy-alert-title,
210
+ .wy-alert.wy-alert-info .rst-content .admonition-title,
211
+ .rst-content .wy-alert.wy-alert-info .admonition-title,
212
+ .rst-content .note .admonition-title,
213
+ .rst-content .wy-alert-info.attention .admonition-title,
214
+ .rst-content .wy-alert-info.caution .admonition-title,
215
+ .rst-content .wy-alert-info.danger .admonition-title,
216
+ .rst-content .wy-alert-info.error .admonition-title,
217
+ .rst-content .wy-alert-info.hint .admonition-title,
218
+ .rst-content .wy-alert-info.important .admonition-title,
219
+ .rst-content .wy-alert-info.tip .admonition-title,
220
+ .rst-content .wy-alert-info.warning .admonition-title,
221
+ .rst-content .seealso .admonition-title,
222
+ .rst-content .wy-alert-info.admonition-todo .admonition-title,
223
+ .rst-content .wy-alert-info.admonition .admonition-title {
224
+ background: #76b900;
225
+ }
226
+
227
+ /* Background and Font Color of Alert Box Main Body*/
228
+ .wy-alert.wy-alert-info,
229
+ .rst-content .note,
230
+ .rst-content .wy-alert-info.attention,
231
+ .rst-content .wy-alert-info.caution,
232
+ .rst-content .wy-alert-info.danger,
233
+ .rst-content .wy-alert-info.error,
234
+ .rst-content .wy-alert-info.hint,
235
+ .rst-content .wy-alert-info.important,
236
+ .rst-content .wy-alert-info.tip,
237
+ .rst-content .wy-alert-info.warning,
238
+ .rst-content .seealso,
239
+ .rst-content .wy-alert-info.admonition-todo,
240
+ .rst-content .wy-alert-info.admonition {
241
+ background: #333333;
242
+ color: #999999;
243
+ }
244
+
245
+ .section {
246
+ margin-top: 50px;
247
+ }
248
+
249
+ /* Logo */
250
+ .navbar-brand-box {
251
+ background-color: #ffffff;
252
+ }
253
+
254
+ /* ---------------------------------------------- Media Queries --------------------------------------- */
255
+ @media (min-width: 1200px) {
256
+ .container-xl {
257
+ max-width: 100%;
258
+ }
259
+ }
260
+
261
+ @media (min-width: 1400px) {
262
+ body {
263
+ font-size: 18px;
264
+ }
265
+
266
+ #site-navigation nav ul.nav {
267
+ font-size: 18px;
268
+ }
269
+
270
+ #site-navigation nav.bd-links p {
271
+ font-size: 18px;
272
+ }
273
+
274
+ #site-navigation {
275
+ width: 350px;
276
+ }
277
+
278
+ .toc-h2 {
279
+ font-size: 18px;
280
+ }
281
+
282
+ .toc-h3 {
283
+ font-size: 1rem;
284
+ }
285
+
286
+ .toc-h4 {
287
+ font-size: 0.85rem;
288
+ }
289
+
290
+ .header-article .bd-toc {
291
+ font-size: 18px;
292
+ }
293
+
294
+ #main-content>div {
295
+ margin-left: 10%;
296
+ margin-right: 10%;
297
+ }
298
+ }
docs/source/_static/js/pk_scripts.js ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ document.addEventListener("DOMContentLoaded", function () {
2
+ var params = window.location.search.substring(1).split("&").reduce(function (params, param) {
3
+ if (!param) {
4
+ return params;
5
+ }
6
+
7
+ var values = param.split("=");
8
+ var name = values[0];
9
+ var value = values[1];
10
+ params[name] = value;
11
+ return params;
12
+ }, {});
13
+
14
+ var form = document.getElementById("feedback-form");
15
+ for (var name in params) {
16
+ var input = form.querySelector("[name=" + name + "]");
17
+ input.value = params[name];
18
+ }
19
+ });
docs/source/_templates/layout.html ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {% extends "!layout.html" %}
2
+
3
+ {% block extrahead %}
4
+
5
+ <script type="text/javascript"
6
+ src="//assets.adobedtm.com/b92787824f2e0e9b68dc2e993f9bd995339fe417/satelliteLib-7ba51e58dc61bcb0e9311aadd02a0108ab24cc6c.js"></script>
7
+
8
+ {% endblock %}
9
+
10
+ {% block footer %}
11
+
12
+ <script type="text/javascript">_satellite.pageBottom();</script>
13
+
14
+ {% endblock %}
docs/source/asr/api.rst ADDED
@@ -0,0 +1,299 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ NeMo ASR collection API
2
+ =======================
3
+
4
+
5
+ Model Classes
6
+ -------------
7
+
8
+ .. autoclass:: nemo.collections.asr.models.EncDecCTCModel
9
+ :show-inheritance:
10
+ :members: transcribe, change_vocabulary, setup_training_data, setup_optimization, setup_validation_data, setup_test_data, register_artifact
11
+
12
+
13
+ .. autoclass:: nemo.collections.asr.models.EncDecCTCModelBPE
14
+ :show-inheritance:
15
+ :members: transcribe, change_vocabulary, setup_training_data, setup_optimization, setup_validation_data, setup_test_data, register_artifact
16
+
17
+
18
+ .. autoclass:: nemo.collections.asr.models.EncDecRNNTModel
19
+ :show-inheritance:
20
+ :members: transcribe, change_vocabulary, setup_training_data, setup_optimization, setup_validation_data, setup_test_data, register_artifact
21
+
22
+
23
+ .. autoclass:: nemo.collections.asr.models.EncDecRNNTBPEModel
24
+ :show-inheritance:
25
+ :members: transcribe, change_vocabulary, setup_training_data, setup_optimization, setup_validation_data, setup_test_data, register_artifact
26
+
27
+
28
+ .. autoclass:: nemo.collections.asr.models.EncDecClassificationModel
29
+ :show-inheritance:
30
+ :members: setup_training_data, setup_optimization, setup_validation_data, setup_test_data, register_artifact
31
+
32
+
33
+ .. autoclass:: nemo.collections.asr.models.EncDecSpeakerLabelModel
34
+ :show-inheritance:
35
+ :members: setup_training_data, setup_optimization, setup_validation_data, setup_test_data, register_artifact
36
+
37
+
38
+ Modules
39
+ -------
40
+
41
+ .. autoclass:: nemo.collections.asr.modules.ConvASREncoder
42
+ :show-inheritance:
43
+ :members:
44
+
45
+ .. autoclass:: nemo.collections.asr.modules.ConvASRDecoder
46
+ :show-inheritance:
47
+ :members:
48
+
49
+ .. autoclass:: nemo.collections.asr.modules.ConvASRDecoderClassification
50
+ :show-inheritance:
51
+ :members:
52
+
53
+ .. autoclass:: nemo.collections.asr.modules.SpeakerDecoder
54
+ :show-inheritance:
55
+ :members:
56
+
57
+ .. _conformer-encoder-api:
58
+
59
+ .. autoclass:: nemo.collections.asr.modules.ConformerEncoder
60
+ :show-inheritance:
61
+ :members:
62
+
63
+ .. _squeezeformer-encoder-api:
64
+
65
+ .. autoclass:: nemo.collections.asr.modules.SqueezeformerEncoder
66
+ :show-inheritance:
67
+ :members:
68
+
69
+ .. _rnn-encoder-api:
70
+
71
+ .. autoclass:: nemo.collections.asr.modules.RNNEncoder
72
+ :show-inheritance:
73
+ :members:
74
+
75
+ .. _rnnt-decoder-api:
76
+
77
+ .. autoclass:: nemo.collections.asr.modules.RNNTDecoder
78
+ :show-inheritance:
79
+ :members:
80
+
81
+ .. autoclass:: nemo.collections.asr.modules.StatelessTransducerDecoder
82
+ :show-inheritance:
83
+ :members:
84
+
85
+ .. _rnnt-joint-api:
86
+
87
+ .. autoclass:: nemo.collections.asr.modules.RNNTJoint
88
+ :show-inheritance:
89
+ :members:
90
+
91
+ .. autoclass:: nemo.collections.asr.modules.SampledRNNTJoint
92
+ :show-inheritance:
93
+ :members:
94
+
95
+
96
+
97
+ Parts
98
+ -----
99
+
100
+ .. autoclass:: nemo.collections.asr.parts.submodules.jasper.JasperBlock
101
+ :show-inheritance:
102
+ :members:
103
+
104
+
105
+ Mixins
106
+ ------
107
+
108
+ .. autoclass:: nemo.collections.asr.parts.mixins.mixins.ASRBPEMixin
109
+ :show-inheritance:
110
+ :members:
111
+
112
+ .. autoclass:: nemo.collections.asr.parts.mixins.mixins.ASRModuleMixin
113
+ :show-inheritance:
114
+ :members:
115
+
116
+ .. autoclass:: nemo.collections.asr.parts.mixins.interctc_mixin.InterCTCMixin
117
+ :show-inheritance:
118
+ :members:
119
+
120
+ Datasets
121
+ --------
122
+
123
+ Character Encoding Datasets
124
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~
125
+
126
+ .. autoclass:: nemo.collections.asr.data.audio_to_text.AudioToCharDataset
127
+ :show-inheritance:
128
+ :members:
129
+
130
+ .. autoclass:: nemo.collections.asr.data.audio_to_text.TarredAudioToCharDataset
131
+ :show-inheritance:
132
+ :members:
133
+
134
+ Subword Encoding Datasets
135
+ ~~~~~~~~~~~~~~~~~~~~~~~~~
136
+
137
+ .. autoclass:: nemo.collections.asr.data.audio_to_text.AudioToBPEDataset
138
+ :show-inheritance:
139
+ :members:
140
+
141
+ .. autoclass:: nemo.collections.asr.data.audio_to_text.TarredAudioToBPEDataset
142
+ :show-inheritance:
143
+ :members:
144
+
145
+ Audio Preprocessors
146
+ -------------------
147
+
148
+ .. autoclass:: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
149
+ :show-inheritance:
150
+ :members:
151
+
152
+ .. autoclass:: nemo.collections.asr.modules.AudioToMFCCPreprocessor
153
+ :show-inheritance:
154
+ :members:
155
+
156
+ Audio Augmentors
157
+ ----------------
158
+
159
+ .. autoclass:: nemo.collections.asr.modules.SpectrogramAugmentation
160
+ :show-inheritance:
161
+ :members:
162
+
163
+ .. autoclass:: nemo.collections.asr.modules.CropOrPadSpectrogramAugmentation
164
+ :show-inheritance:
165
+ :members:
166
+
167
+ .. autoclass:: nemo.collections.asr.parts.preprocessing.perturb.SpeedPerturbation
168
+ :show-inheritance:
169
+ :members:
170
+
171
+ .. autoclass:: nemo.collections.asr.parts.preprocessing.perturb.TimeStretchPerturbation
172
+ :show-inheritance:
173
+ :members:
174
+
175
+ .. autoclass:: nemo.collections.asr.parts.preprocessing.perturb.GainPerturbation
176
+ :show-inheritance:
177
+ :members:
178
+
179
+ .. autoclass:: nemo.collections.asr.parts.preprocessing.perturb.ImpulsePerturbation
180
+ :show-inheritance:
181
+ :members:
182
+
183
+ .. autoclass:: nemo.collections.asr.parts.preprocessing.perturb.ShiftPerturbation
184
+ :show-inheritance:
185
+ :members:
186
+
187
+ .. autoclass:: nemo.collections.asr.parts.preprocessing.perturb.NoisePerturbation
188
+ :show-inheritance:
189
+ :members:
190
+
191
+ .. autoclass:: nemo.collections.asr.parts.preprocessing.perturb.WhiteNoisePerturbation
192
+ :show-inheritance:
193
+ :members:
194
+
195
+ .. autoclass:: nemo.collections.asr.parts.preprocessing.perturb.RirAndNoisePerturbation
196
+ :show-inheritance:
197
+ :members:
198
+
199
+ .. autoclass:: nemo.collections.asr.parts.preprocessing.perturb.TranscodePerturbation
200
+ :show-inheritance:
201
+ :members:
202
+
203
+ Miscellaneous Classes
204
+ ---------------------
205
+
206
+ CTC Decoding
207
+ ~~~~~~~~~~~~
208
+
209
+ .. autoclass:: nemo.collections.asr.metrics.wer.CTCDecoding
210
+ :show-inheritance:
211
+ :members:
212
+
213
+ .. autoclass:: nemo.collections.asr.metrics.wer_bpe.CTCBPEDecoding
214
+ :show-inheritance:
215
+ :members:
216
+
217
+ .. autoclass:: nemo.collections.asr.parts.submodules.ctc_greedy_decoding.GreedyCTCInfer
218
+ :show-inheritance:
219
+ :members:
220
+
221
+ .. autoclass:: nemo.collections.asr.parts.submodules.ctc_beam_decoding.BeamCTCInfer
222
+ :show-inheritance:
223
+ :members:
224
+
225
+ RNNT Decoding
226
+ ~~~~~~~~~~~~~
227
+
228
+ .. autoclass:: nemo.collections.asr.metrics.rnnt_wer.RNNTDecoding
229
+ :show-inheritance:
230
+ :members:
231
+
232
+ .. autoclass:: nemo.collections.asr.metrics.rnnt_wer_bpe.RNNTBPEDecoding
233
+ :show-inheritance:
234
+ :members:
235
+
236
+ .. autoclass:: nemo.collections.asr.parts.submodules.rnnt_greedy_decoding.GreedyRNNTInfer
237
+ :show-inheritance:
238
+ :members:
239
+
240
+ .. autoclass:: nemo.collections.asr.parts.submodules.rnnt_greedy_decoding.GreedyBatchedRNNTInfer
241
+ :show-inheritance:
242
+ :members:
243
+
244
+ .. autoclass:: nemo.collections.asr.parts.submodules.rnnt_beam_decoding.BeamRNNTInfer
245
+ :show-inheritance:
246
+ :members:
247
+
248
+ Hypotheses
249
+ ~~~~~~~~~~
250
+
251
+ .. autoclass:: nemo.collections.asr.parts.utils.rnnt_utils.Hypothesis
252
+ :show-inheritance:
253
+ :no-members:
254
+
255
+ .. autoclass:: nemo.collections.asr.parts.utils.rnnt_utils.NBestHypotheses
256
+ :show-inheritance:
257
+ :no-members:
258
+
259
+ Adapter Networks
260
+ ~~~~~~~~~~~~~~~~
261
+
262
+ .. autoclass:: nemo.collections.asr.parts.submodules.adapters.multi_head_attention_adapter_module.MultiHeadAttentionAdapter
263
+ :show-inheritance:
264
+ :members:
265
+ :member-order: bysource
266
+
267
+ -----
268
+
269
+ .. autoclass:: nemo.collections.asr.parts.submodules.adapters.multi_head_attention_adapter_module.RelPositionMultiHeadAttentionAdapter
270
+ :show-inheritance:
271
+ :members:
272
+ :member-order: bysource
273
+
274
+ -----
275
+
276
+ .. autoclass:: nemo.collections.asr.parts.submodules.adapters.multi_head_attention_adapter_module.PositionalEncodingAdapter
277
+ :show-inheritance:
278
+ :members:
279
+ :member-order: bysource
280
+
281
+ -----
282
+
283
+ .. autoclass:: nemo.collections.asr.parts.submodules.adapters.multi_head_attention_adapter_module.RelPositionalEncodingAdapter
284
+ :show-inheritance:
285
+ :members:
286
+ :member-order: bysource
287
+
288
+
289
+ Adapter Strategies
290
+ ~~~~~~~~~~~~~~~~~~
291
+
292
+ .. autoclass:: nemo.collections.asr.parts.submodules.adapters.multi_head_attention_adapter_module.MHAResidualAddAdapterStrategy
293
+ :show-inheritance:
294
+ :members:
295
+ :member-order: bysource
296
+ :undoc-members: adapter_module_names
297
+
298
+ -----
299
+
docs/source/asr/asr_all.bib ADDED
@@ -0,0 +1,1043 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ @article{matchboxnet,
2
+ title={{MatchboxNet}: 1D Time-Channel Separable Convolutional Neural Network Architecture for Speech Commands Recognition},
3
+ author={Majumdar, Somshubra and Ginsburg, Boris},
4
+ journal={Proc. Interspeech 2020},
5
+ year={2020}
6
+ }
7
+
8
+ @article{marblenet,
9
+ title={MarbleNet: Deep 1D Time-Channel Separable Convolutional Neural Network for Voice Activity Detection},
10
+ author={Jia, Fei and Majumdar, Somshubra and Ginsburg, Boris},
11
+ journal={arXiv preprint arXiv:2010.13886},
12
+ year={2020}
13
+ }
14
+
15
+ @inproceedings{panayotov2015librispeech,
16
+ title={Librispeech: an ASR corpus based on public domain audio books},
17
+ author={Panayotov, Vassil and Chen, Guoguo and Povey, Daniel and Khudanpur, Sanjeev},
18
+ booktitle={Acoustics, Speech and Signal Processing (ICASSP), 2015 IEEE International Conference on},
19
+ pages={5206--5210},
20
+ year={2015},
21
+ organization={IEEE}
22
+ }
23
+
24
+ @article{luong17,
25
+ author = {Minh{-}Thang Luong and Eugene Brevdo and Rui Zhao},
26
+ title = {Neural Machine Translation (seq2seq) Tutorial},
27
+ journal = {https://github.com/tensorflow/nmt},
28
+ year = {2017},
29
+ }
30
+
31
+ @INPROCEEDINGS{LaurentSeqWiseBN,
32
+ author={C. {Laurent} and G. {Pereyra} and P. {Brakel} and Y. {Zhang} and Y. {Bengio}},
33
+ booktitle={2016 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
34
+ title={Batch normalized recurrent neural networks},
35
+ year={2016},
36
+ volume={},
37
+ number={},
38
+ pages={2657-2661},
39
+ keywords={feedforward neural nets;learning (artificial intelligence);recurrent neural nets;speech recognition;batch normalized recurrent neural networks;RNN;sequential data;long-term dependency learning;convergence rate improvement;intermediate representation normalization;feedforward neural networks;speech recognition task;language modeling;training criterion;Training;Recurrent neural networks;Convergence;Speech recognition;Computer architecture;Speech;batch normalization;RNN;LSTM;optimization},
40
+ doi={10.1109/ICASSP.2016.7472159},
41
+ ISSN={2379-190X},
42
+ month={March},}
43
+
44
+ @article{graves2005,
45
+ author = {Alex Graves and J{\"u}rgen Schmidhuber},
46
+ title = {Framewise phoneme classification with bidirectional LSTM and other neural network architectures},
47
+ journal = {Neural Networks, vol. 18},
48
+ pages={602--610},
49
+ year = {2005},
50
+ }
51
+
52
+ @inproceedings{graves2006,
53
+ title={Connectionist temporal classification: labelling unsegmented sequence data with recurrent neural networks},
54
+ author={Graves, Alex and Fern{\'a}ndez, Santiago and Gomez, Faustino and Schmidhuber, J{\"u}rgen},
55
+ booktitle={Proceedings of the 23rd international conference on Machine learning},
56
+ pages={369--376},
57
+ year={2006},
58
+ organization={ACM}
59
+ }
60
+
61
+ @article{li2019jasper,
62
+ title={Jasper: An End-to-End Convolutional Neural Acoustic Model},
63
+ author={Li, Jason and Lavrukhin, Vitaly and Ginsburg, Boris and Leary, Ryan and Kuchaiev, Oleksii and Cohen, Jonathan M and Nguyen, Huyen and Gadde, Ravi Teja},
64
+ journal={arXiv preprint arXiv:1904.03288},
65
+ year={2019}
66
+ }
67
+
68
+ @misc{ardila2019common,
69
+ title={Common Voice: A Massively-Multilingual Speech Corpus},
70
+ author={Rosana Ardila and Megan Branson and Kelly Davis and Michael Henretty and Michael Kohler and Josh Meyer and Reuben Morais and Lindsay Saunders and Francis M. Tyers and Gregor Weber},
71
+ year={2019},
72
+ eprint={1912.06670},
73
+ archivePrefix={arXiv},
74
+ primaryClass={cs.CL}
75
+ }
76
+
77
+ @article{graves2012,
78
+ title={Sequence Transduction with Recurrent Neural Networks},
79
+ author={Graves, Alex},
80
+ journal={arXiv preprint arXiv:1211.3711},
81
+ year={2012}
82
+ }
83
+
84
+
85
+ @article{graves2013,
86
+ title={Generating sequences with recurrent neural networks},
87
+ author={Graves, Alex},
88
+ journal={arXiv preprint arXiv:1308.0850},
89
+ year={2013}
90
+ }
91
+
92
+ @article{sergeev2018horovod,
93
+ title={Horovod: fast and easy distributed deep learning in TensorFlow},
94
+ author={Sergeev, Alexander and Del Balso, Mike},
95
+ journal={arXiv preprint arXiv:1802.05799},
96
+ year={2018}
97
+ }
98
+
99
+ @misc{NVVolta,
100
+ title = {NVIDIA TESLA V100 GPU ARCHITECTURE},
101
+ howpublished = {\url{http://images.nvidia.com/content/volta-architecture/pdf/volta-architecture-whitepaper.pdf}},
102
+ note = {Accessed: 2018-10-09}
103
+ }
104
+
105
+ @article{NVTuring,
106
+ title = {NVIDIA TURING GPU ARCHITECTURE},
107
+ howpublished = {\url{https://www.nvidia.com/content/dam/en-zz/Solutions/design-visualization/technologies/turing-architecture/NVIDIA-Turing-Architecture-Whitepaper.pdf}},
108
+ author = {NVIDIA},
109
+ year = {2018},
110
+ note = {Accessed: 2018-10-09}
111
+ }
112
+
113
+ @misc{Rygaard2015,
114
+ title = {Using Synthesized Speech to Improve Speech Recognition for Low-Resource Languages},
115
+ author = {Luise Valentin Rygaard},
116
+ howpublished = {\url{https://parasol.tamu.edu/dreu2015/Rygaard/report.pdf}},
117
+ year = {2015},
118
+ }
119
+
120
+ @misc{OpenSeq2Seq,
121
+ title = {OpenSeq2Seq: extensible toolkit for distributed and mixed precision training of sequence-to-sequence models},
122
+ author = {Kuchaiev, Oleksii and Ginsburg, Boris and Gitman, Igor and Lavrukhin,Vitaly and Case, Carl and Micikevicius, Paulius},
123
+ howpublished = {\url{https://arxiv.org/abs/1805.10387}},
124
+ year = {2018},
125
+ }
126
+
127
+ @misc{MPGuide,
128
+ title = {Training with Mixed Precision},
129
+ howpublished = {\url{http://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/}},
130
+ note = {Accessed: 2018-04-06},
131
+ }
132
+
133
+ @misc{Mozilla,
134
+ title = {Mozilla: A Journey to less than 10\% Word Error Rate},
135
+ howpublished = {\url{https://hacks.mozilla.org/2017/11/a-journey-to-10-word-error-rate/}},
136
+ note = {Accessed: 2018-04-06},
137
+ }
138
+
139
+ @article{Waibel1989,
140
+ title={Phoneme recognition using time-delay neural networks},
141
+ author={Waibel, Alexander and Hanazawa, Toshiyuki and Hinton, Geoffrey and Shikano, Kiyohiro and Lang, Kevin},
142
+ journal={IEEE Trans. on Acoustics, Speech and Signal Processing},
143
+ year={1989}
144
+ }
145
+
146
+ @article{Lang1990,
147
+ title={A time-delay neural network architecture for isolated word recognition},
148
+ author={Lang, Kevin and Waibel, Alexander and Hinton, Geoffrey},
149
+ journal={Neural Networks},
150
+ year={1990}
151
+ }
152
+
153
+ @book{Bengio1996,
154
+ Author = {Bengio, Y.},
155
+ Publisher = {International Thomson Computer Press},
156
+ Title = {Neural Networks for Speech and Sequence Recognition},
157
+ Year = {1996}
158
+ }
159
+
160
+ @article{Bengio1992,
161
+ title={Global optimization of a neural network-hidden Markov model hybrid},
162
+ author={Bengio, Y. and De Mori, R. and Flammia, G. and Kompe, R.},
163
+ journal={IEEE Transactions on Neural Networks, 3(2), 252–259},
164
+ year={1992}
165
+ }
166
+
167
+ @article{Bourlard1994,
168
+ title={Connectionist speech recognition: a hybrid approach},
169
+ author={Bourlard, H. A. and Morgan, N.},
170
+ journal={volume 247 Springer },
171
+ year={1994}
172
+ }
173
+
174
+ @article{srivastava14a,
175
+ author = {Nitish Srivastava and Geoffrey Hinton and Alex Krizhevsky and Ilya Sutskever and Ruslan Salakhutdinov},
176
+ title = {Dropout: A Simple Way to Prevent Neural Networks from Overfitting},
177
+ journal = {Journal of Machine Learning Research},
178
+ year = {2014},
179
+ volume = {15},
180
+ pages = {1929-1958},
181
+ url = {http://jmlr.org/papers/v15/srivastava14a.html}
182
+ }
183
+
184
+
185
+ @article{Hinton2012,
186
+ title={Deep Neural Networks for Acoustic Modeling in Speech Recognition},
187
+ author={ Hinton,Geoffrey and Deng, Li and Yu, Dong and Dahl,George and Mohamed,Abdel-rahman and Jaitly, Navdeep and Senior, Andrew and Vanhoucke, Vincent and Nguyen, Patrick and Kingsbury, Brian and Sainath, Tara},
188
+ journal={IEEE Signal Processing Magazine},
189
+ year={2012}
190
+ }
191
+
192
+ @article{Graves2014,
193
+ title={Towards End-to-End Speech Recognition with Recurrent Neural Networks},
194
+ author={Graves, Alex and Jaitly, Navdeep},
195
+ journal={International Conference on Machine Learning},
196
+ year={2014}
197
+ }
198
+
199
+ @article{Chorowski2014,
200
+ title={End-to-end Continuous Speech Recognition using Attention-based Recurrent NN: First Results},
201
+ author={Chorowski, Jan and Bahdanau, Dzmitry and Cho, Kyunghyun and Bengio, Yoshua},
202
+ journal={Neural Information Processing Systems: Workshop Deep Learning and Representation Learning Workshop },
203
+ year={2014}
204
+ }
205
+
206
+ @article{Sak2014,
207
+ title={Long short-term memory recurrent neural network architectures for large scale acoustic modeling},
208
+ author={Sak, Hasim and Senior, Andrew and Beaufays, Francoise },
209
+ journal={Interspeech 2014},
210
+ year={2014}
211
+ }
212
+
213
+ @article{Ko2015,
214
+ title={Audio Augmentation for Speech Recognition},
215
+ author={Ko, Tom and Peddinti, Vijayaditya and Povey, Daniel
216
+ and Khudanpur, Sanjeev},
217
+ journal={Interspeech 2015},
218
+ year={2015}
219
+ }
220
+
221
+ @article{Tjandra2017,
222
+ title={Listening while Speaking: Speech Chain by Deep Learning},
223
+ author={Tjandra, Andros and Sakti, Sakriani and Nakamura, Satoshi},
224
+ journal={ASRU 2017},
225
+ year={2017}
226
+ }
227
+
228
+ @article{Tjandra2018,
229
+ title={Machine Speech Chain with One-shot Speaker Adaptation},
230
+ author={Tjandra, Andros and Sakti, Sakriani and Nakamura, Satoshi},
231
+ journal={Interspeech 2018},
232
+ year={2018}
233
+ }
234
+
235
+ @article{bahdanau2014neural,
236
+ title={Neural machine translation by jointly learning to align and translate},
237
+ author={Bahdanau, Dzmitry and Cho, Kyunghyun and Bengio, Yoshua},
238
+ journal={arXiv preprint arXiv:1409.0473},
239
+ year={2014}
240
+ }
241
+
242
+ @article{cho2014learning,
243
+ title={Learning phrase representations using RNN encoder-decoder for statistical machine translation},
244
+ author={Cho, Kyunghyun and Van Merri{\"e}nboer, Bart and Gulcehre, Caglar and Bahdanau, Dzmitry and Bougares, Fethi and Schwenk, Holger and Bengio, Yoshua},
245
+ journal={arXiv preprint arXiv:1406.1078},
246
+ year={2014}
247
+ }
248
+
249
+ @article{rush2015neural,
250
+ title={A neural attention model for abstractive sentence summarization},
251
+ author={Rush, Alexander M and Chopra, Sumit and Weston, Jason},
252
+ journal={arXiv preprint arXiv:1509.00685},
253
+ year={2015}
254
+ }
255
+
256
+ @article{micikevicius2017mixed,
257
+ title={Mixed precision training},
258
+ author={Micikevicius, Paulius and Narang, Sharan and Alben, Jonah and Diamos, Gregory and Elsen, Erich and Garcia, David and Ginsburg, Boris and Houston, Michael and Kuchaiev, Oleksii and Venkatesh, Ganesh and others},
259
+ journal={arXiv preprint arXiv:1710.03740},
260
+ year={2017}
261
+ }
262
+
263
+ @ARTICLE{Britz:2017,
264
+ author = {{Britz}, Denny and {Goldie}, Anna and {Luong}, Thang and {Le}, Quoc},
265
+ title = {Massive Exploration of Neural Machine Translation Architectures},
266
+ journal = {ArXiv e-prints arXiv:1703.03906},
267
+ archivePrefix = "arXiv",
268
+ eprinttype = {arxiv},
269
+ eprint = {1703.03906},
270
+ primaryClass = "cs.CL",
271
+ keywords = {Computer Science - Computation and Language},
272
+ year = 2017,
273
+ month = mar
274
+ }
275
+
276
+ @inproceedings{abadi2016tensorflow,
277
+ title={TensorFlow: A System for Large-Scale Machine Learning.},
278
+ author={Abadi, Mart{\'\i}n and Barham, Paul and Chen, Jianmin and Chen, Zhifeng and Davis, Andy and Dean, Jeffrey and Devin, Matthieu and Ghemawat, Sanjay and Irving, Geoffrey and Isard, Michael and others},
279
+ booktitle={OSDI},
280
+ volume={16},
281
+ pages={265--283},
282
+ year={2016}
283
+ }
284
+
285
+ @article{tensor2tensor,
286
+ author = {Ashish Vaswani and Samy Bengio and Eugene Brevdo and Francois Chollet and Aidan N. Gomez and Stephan Gouws and Llion Jones and \L{}ukasz Kaiser and Nal Kalchbrenner and Niki Parmar and Ryan Sepassi and
287
+ Noam Shazeer and Jakob Uszkoreit},
288
+ title = {Tensor2Tensor for Neural Machine Translation},
289
+ journal = {CoRR},
290
+ volume = {abs/1803.07416},
291
+ year = {2018},
292
+ url = {http://arxiv.org/abs/1803.07416},
293
+ }
294
+
295
+ @article{gehring2017convs2s,
296
+ author = {Gehring, Jonas and Auli, Michael and Grangier, David and Yarats, Denis and Dauphin, Yann N},
297
+ title = "{Convolutional Sequence to Sequence Learning}",
298
+ journal = {ArXiv e-prints arXiv:1705.03122},
299
+ archivePrefix = "arXiv",
300
+ eprinttype = {arxiv},
301
+ eprint = {1705.03122},
302
+ primaryClass = "cs.CL",
303
+ keywords = {Computer Science - Computation and Language},
304
+ year = 2017,
305
+ month = May,
306
+ }
307
+
308
+ @inproceedings{chan2015,
309
+ title={Listen, attend and spell},
310
+ author={Chan, William and Jaitly, Navdeep and Le, Quoc V and Vinyals, Oriol},
311
+ booktitle={Acoustics, Speech and Signal Processing (ICASSP), 2016 IEEE International Conference on},
312
+ pages={5206--5210},
313
+ year={2016},
314
+ organization={IEEE}
315
+ }
316
+
317
+ @inproceedings{xu2015show,
318
+ title={Show, attend and tell: Neural image caption generation with visual attention},
319
+ author={Xu, Kelvin and Ba, Jimmy and Kiros, Ryan and Cho, Kyunghyun and Courville, Aaron and Salakhutdinov, Ruslan and Zemel, Rich and Bengio, Yoshua},
320
+ booktitle={International Conference on Machine Learning},
321
+ pages={2048--2057},
322
+ year={2015}
323
+ }
324
+
325
+ @incollection{Sutskever2014,
326
+ title = {Sequence to Sequence Learning with Neural Networks},
327
+ author = {Sutskever, Ilya and Vinyals, Oriol and Le, Quoc V},
328
+ booktitle = {Advances in Neural Information Processing Systems 27},
329
+ editor = {Z. Ghahramani and M. Welling and C. Cortes and N. D. Lawrence and K. Q. Weinberger},
330
+ pages = {3104--3112},
331
+ year = {2014},
332
+ publisher = {Curran Associates, Inc.},
333
+ url = {http://papers.nips.cc/paper/5346-sequence-to-sequence-learning-with-neural-networks.pdf}
334
+ }
335
+
336
+ @article{DeepSpeech2014,
337
+ title = {Deep Speech: Scaling up end-to-end speech recognition},
338
+ author = {Awni Y. Hannun and Carl Case and Jared Casper and Bryan Catanzaro and Greg Diamos and Erich Elsen and Ryan Prenger and Sanjeev Satheesh and Shubho Sengupta and Adam Coates and Andrew Y. Ng},
339
+ journal = {CoRR},
340
+ volume = {abs/1412.5567},
341
+ year = {2014},
342
+ url = {http://arxiv.org/abs/1412.5567},
343
+ archivePrefix = {arXiv},
344
+ eprint = {1412.5567},
345
+ timestamp = {Mon, 13 Aug 2018 16:48:07 +0200},
346
+ biburl = {https://dblp.org/rec/bib/journals/corr/HannunCCCDEPSSCN14},
347
+ bibsource = {dblp computer science bibliography, https://dblp.org}
348
+ }
349
+
350
+ @inproceedings{DeepSpeech2,
351
+ author = {Amodei, Dario and Ananthanarayanan, Sundaram and Anubhai, Rishita and Bai, Jingliang and Battenberg, Eric and Case, Carl and Casper, Jared and Catanzaro, Bryan and Cheng, Qiang and Chen, Guoliang and Chen, Jie and Chen, Jingdong and Chen, Zhijie and Chrzanowski, Mike and Coates, Adam and Diamos, Greg and Ding, Ke and Du, Niandong and Elsen, Erich and Engel, Jesse and Fang, Weiwei and Fan, Linxi and Fougner, Christopher and Gao, Liang and Gong, Caixia and Hannun, Awni and Han, Tony and Johannes, Lappi Vaino and Jiang, Bing and Ju, Cai and Jun, Billy and LeGresley, Patrick and Lin, Libby and Liu, Junjie and Liu, Yang and Li, Weigao and Li, Xiangang and Ma, Dongpeng and Narang, Sharan and Ng, Andrew and Ozair, Sherjil and Peng, Yiping and Prenger, Ryan and Qian, Sheng and Quan, Zongfeng and Raiman, Jonathan and Rao, Vinay and Satheesh, Sanjeev and Seetapun, David and Sengupta, Shubho and Srinet, Kavya and Sriram, Anuroop and Tang, Haiyuan and Tang, Liliang and Wang, Chong and Wang, Jidong and Wang, Kaifu and Wang, Yi and Wang, Zhijian and Wang, Zhiqian and Wu, Shuang and Wei, Likai and Xiao, Bo and Xie, Wen and Xie, Yan and Yogatama, Dani and Yuan, Bin and Zhan, Jun and Zhu, Zhenyao},
352
+ title = {Deep Speech 2: End-to-end Speech Recognition in English and Mandarin},
353
+ booktitle = {Proceedings of the 33rd International Conference on Machine Learning - Volume 48},
354
+ series = {ICML'16},
355
+ year = {2016},
356
+ location = {New York, NY, USA},
357
+ pages = {173--182},
358
+ numpages = {10},
359
+ url = {http://dl.acm.org/citation.cfm?id=3045390.3045410},
360
+ acmid = {3045410},
361
+ publisher = {JMLR.org},
362
+ }
363
+
364
+ @inproceedings{prabhavalkar2017comparison,
365
+ title={A comparison of sequence-to-sequence models for speech recognition},
366
+ author={Prabhavalkar, Rohit and Rao, Kanishka and Sainath, Tara N and Li, Bo and Johnson, Leif and Jaitly, Navdeep},
367
+ booktitle={Proc. Interspeech},
368
+ pages={939--943},
369
+ year={2017}
370
+ }
371
+
372
+ @article{chiu2017state,
373
+ title={State-of-the-art speech recognition with sequence-to-sequence models},
374
+ author={Chiu, Chung-Cheng and Sainath, Tara N and Wu, Yonghui and Prabhavalkar, Rohit and Nguyen, Patrick and Chen, Zhifeng and Kannan, Anjuli and Weiss, Ron J and Rao, Kanishka and Gonina, Katya and others},
375
+ journal={arXiv preprint arXiv:1712.01769},
376
+ year={2017}
377
+ }
378
+
379
+ @misc{NVMixed,
380
+ title = {{NVIDIA's Mixed-Precision Training - TensorFlow example}},
381
+ howpublished = {\url{https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/#example_tensorflow}},
382
+ author={NVIDIA},
383
+ note = {Accessed: 2018-10-09},
384
+ year={2018}
385
+ }
386
+
387
+ @article{gehring2017,
388
+ title={Convolutional sequence to sequence learning},
389
+ author={Gehring, Jonas and Auli, Michael and Grangier, David and Yarats, Denis and Dauphin, Yann N},
390
+ journal={arXiv preprint arXiv:1705.03122},
391
+ year={2017}
392
+ }
393
+
394
+ @article{collobert2016,
395
+ title={Wav2letter: an end-to-end convnet-based speech recognition system},
396
+ author={Collobert, Ronan and Puhrsch, Christian and Synnaeve, Gabriel},
397
+ journal={arXiv preprint arXiv:1609.03193},
398
+ year={2016}
399
+ }
400
+
401
+ @inproceedings{Zhang2016,
402
+ author={Ying Zhang and Mohammad Pezeshki and Philémon Brakel and Saizheng Zhang and César Laurent and Yoshua Bengio and Aaron Courville},
403
+ title={Towards End-to-End Speech Recognition with Deep Convolutional Neural Networks},
404
+ year=2016,
405
+ booktitle={Interspeech 2016},
406
+ doi={10.21437/Interspeech.2016-1446},
407
+ url={http://dx.doi.org/10.21437/Interspeech.2016-1446},
408
+ pages={410--414}
409
+ }
410
+
411
+ @inproceedings{Zhang2017,
412
+ title={Very deep convolutional networks for end-to-end speech recognition},
413
+ author={Zhang, Yu and Chan, William and Jaitly, Navdeep},
414
+ booktitle={Acoustics, Speech and Signal Processing (ICASSP), 2017 IEEE International Conference on},
415
+ year={2017},
416
+ organization={IEEE}
417
+ }
418
+
419
+
420
+ @article{Wang2017,
421
+ title={Tacotron: Towards End-to-End Speech Synthesis},
422
+ author={Wang, Yuxuan and Skerry-Ryan, RJ and Stanton, Daisy and Wu, Yonghui and Weiss, Ron and Jaitly, Navdeep and Yang, Zongheng and Xiao, Ying and Chen, Zhifeng and Bengio, Samy and Le, Quoc and Agiomyrgiannakis, Yannis and Clark, Rob and Saurous, Rif A.},
423
+ journal={arXiv preprint arXiv:1703.10135},
424
+ year={2017}
425
+ }
426
+
427
+ @article{griffin1984signal,
428
+ title={Signal estimation from modified short-time Fourier transform},
429
+ author={Griffin, Daniel and Lim, Jae},
430
+ journal={IEEE Transactions on Acoustics, Speech, and Signal Processing},
431
+ volume={32},
432
+ number={2},
433
+ pages={236--243},
434
+ year={1984},
435
+ publisher={IEEE}
436
+ }
437
+
438
+ @misc{ito2017lj,
439
+ title={The LJ speech dataset},
440
+ author={Ito, Keith and others},
441
+ year={2017}
442
+ }
443
+
444
+ @misc{mailabs,
445
+ title = {{The M-AILABS Speech Dataset}},
446
+ howpublished = {\url{http://www.m-ailabs.bayern/en/the-mailabs-speech-dataset/}},
447
+ author={M-AILABS},
448
+ note = {Accessed: 2018-10-09},
449
+ year={2018}
450
+ }
451
+
452
+ @article{merity2016pointer,
453
+ title={Pointer sentinel mixture models},
454
+ author={Merity, Stephen and Xiong, Caiming and Bradbury, James and Socher, Richard},
455
+ journal={arXiv preprint arXiv:1609.07843},
456
+ year={2016}
457
+ }
458
+
459
+ @inproceedings{socher2013recursive,
460
+ title={Recursive deep models for semantic compositionality over a sentiment treebank},
461
+ author={Socher, Richard and Perelygin, Alex and Wu, Jean and Chuang, Jason and Manning, Christopher D and Ng, Andrew and Potts, Christopher},
462
+ booktitle={Proceedings of the 2013 conference on empirical methods in natural language processing},
463
+ pages={1631--1642},
464
+ year={2013}
465
+ }
466
+
467
+ @InProceedings{maas-EtAl:2011:ACL-HLT2011,
468
+ author = {Maas, Andrew L. and Daly, Raymond E. and Pham, Peter T. and Huang, Dan and Ng, Andrew Y. and Potts, Christopher},
469
+ title = {Learning Word Vectors for Sentiment Analysis},
470
+ booktitle = {Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies},
471
+ month = {June},
472
+ year = {2011},
473
+ address = {Portland, Oregon, USA},
474
+ publisher = {Association for Computational Linguistics},
475
+ pages = {142--150},
476
+ url = {http://www.aclweb.org/anthology/P11-1015}
477
+ }
478
+
479
+ @inproceedings{Povey2018SemiOrthogonalLM,
480
+ title={Semi-Orthogonal Low-Rank Matrix Factorization for Deep Neural Networks},
481
+ author={Daniel Povey and Gaofeng Cheng and Yiming Wang and Ke Li and Hainan Xu and Mahsa Yarmohammadi and Sanjeev Khudanpur},
482
+ booktitle={Interspeech},
483
+ year={2018}
484
+ }
485
+
486
+ @article{CAPIO2017,
487
+ author = {Kyu J. Han and Akshay Chandrashekaran and Jungsuk Kim and Ian R. Lane},
488
+ title = {The {CAPIO} 2017 Conversational Speech Recognition System},
489
+ journal = {CoRR},
490
+ volume = {abs/1801.00059},
491
+ year = {2018},
492
+ url = {http://arxiv.org/abs/1801.00059},
493
+ archivePrefix = {arXiv},
494
+ eprint = {1801.00059},
495
+ timestamp = {Mon, 13 Aug 2018 16:49:10 +0200},
496
+ biburl = {https://dblp.org/rec/bib/journals/corr/abs-1801-00059},
497
+ bibsource = {dblp computer science bibliography, https://dblp.org}
498
+ }
499
+
500
+ @article{WaveNet,
501
+ author = {A{\"{a}}ron van den Oord and Sander Dieleman and Heiga Zen and Karen Simonyan and Oriol Vinyals and Alex Graves and Nal Kalchbrenner and Andrew W. Senior and Koray Kavukcuoglu},
502
+ title = {WaveNet: {A} Generative Model for Raw Audio},
503
+ journal = {CoRR},
504
+ volume = {abs/1609.03499},
505
+ year = {2016},
506
+ url = {http://arxiv.org/abs/1609.03499},
507
+ archivePrefix = {arXiv},
508
+ eprint = {1609.03499},
509
+ timestamp = {Mon, 13 Aug 2018 16:49:15 +0200},
510
+ biburl = {https://dblp.org/rec/bib/journals/corr/OordDZSVGKSK16},
511
+ bibsource = {dblp computer science bibliography, https://dblp.org}
512
+ }
513
+
514
+ @article{FacebookGERENGBackTranslation,
515
+ author = {Rico Sennrich and Barry Haddow and Alexandra Birch},
516
+ title = {Improving Neural Machine Translation Models with Monolingual Data},
517
+ journal = {CoRR},
518
+ volume = {abs/1511.06709},
519
+ year = {2015},
520
+ url = {http://arxiv.org/abs/1511.06709},
521
+ archivePrefix = {arXiv},
522
+ eprint = {1511.06709},
523
+ timestamp = {Mon, 13 Aug 2018 16:47:05 +0200},
524
+ biburl = {https://dblp.org/rec/bib/journals/corr/SennrichHB15a},
525
+ bibsource = {dblp computer science bibliography, https://dblp.org}
526
+ }
527
+
528
+ @article{GlobalStyleTokens,
529
+ author = {Yuxuan Wang and Daisy Stanton and Yu Zhang and R. J. Skerry{-}Ryan and Eric Battenberg and Joel Shor and Ying Xiao and Fei Ren and Ye Jia and Rif A. Saurous},
530
+ title = {Style Tokens: Unsupervised Style Modeling, Control and Transfer in End-to-End Speech Synthesis},
531
+ journal = {CoRR},
532
+ volume = {abs/1803.09017},
533
+ year = {2018},
534
+ url = {http://arxiv.org/abs/1803.09017},
535
+ archivePrefix = {arXiv},
536
+ eprint = {1803.09017},
537
+ timestamp = {Mon, 13 Aug 2018 16:46:53 +0200},
538
+ biburl = {https://dblp.org/rec/bib/journals/corr/abs-1803-09017},
539
+ bibsource = {dblp computer science bibliography, https://dblp.org}
540
+ }
541
+
542
+ @article{IoffeS15BatchNorm,
543
+ author = {Sergey Ioffe and Christian Szegedy},
544
+ title = {Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift},
545
+ journal = {CoRR},
546
+ volume = {abs/1502.03167},
547
+ year = {2015},
548
+ url = {http://arxiv.org/abs/1502.03167},
549
+ archivePrefix = {arXiv},
550
+ eprint = {1502.03167},
551
+ timestamp = {Mon, 13 Aug 2018 16:47:06 +0200},
552
+ biburl = {https://dblp.org/rec/bib/journals/corr/IoffeS15},
553
+ bibsource = {dblp computer science bibliography, https://dblp.org}
554
+ }
555
+
556
+ @article{kingma,
557
+ author = {Diederik P. Kingma and
558
+ Jimmy Ba},
559
+ title = {Adam: {A} Method for Stochastic Optimization},
560
+ journal = {CoRR},
561
+ volume = {abs/1412.6980},
562
+ year = {2014},
563
+ url = {http://arxiv.org/abs/1412.6980},
564
+ archivePrefix = {arXiv},
565
+ eprint = {1412.6980},
566
+ timestamp = {Mon, 13 Aug 2018 01:00:00 +0200},
567
+ biburl = {https://dblp.org/rec/bib/journals/corr/KingmaB14},
568
+ bibsource = {dblp computer science bibliography, https://dblp.org}
569
+ }
570
+
571
+ @incollection{Salimans2016WeightNorm,
572
+ title = {Weight Normalization: A Simple Reparameterization to Accelerate Training of Deep Neural Networks},
573
+ author = {Salimans, Tim and Kingma, Durk P},
574
+ booktitle = {Advances in Neural Information Processing Systems 29},
575
+ editor = {D. D. Lee and M. Sugiyama and U. V. Luxburg and I. Guyon and R. Garnett},
576
+ pages = {901--909},
577
+ year = {2016},
578
+ publisher = {Curran Associates, Inc.},
579
+ url = {http://papers.nips.cc/paper/6114-weight-normalization-a-simple-reparameterization-to-accelerate-training-of-deep-neural-networks.pdf}
580
+ }
581
+
582
+ @article{wu2016google,
583
+ title={Google's neural machine translation system: Bridging the gap between human and machine translation},
584
+ author={Wu, Yonghui and Schuster, Mike and Chen, Zhifeng and Le, Quoc V and Norouzi, Mohammad and Macherey, Wolfgang and Krikun, Maxim and Cao, Yuan and Gao, Qin and Macherey, Klaus and others},
585
+ journal={arXiv preprint arXiv:1609.08144},
586
+ year={2016}
587
+ }
588
+
589
+ @inproceedings{opennmt,
590
+ author = {Guillaume Klein and Yoon Kim and Yuntian Deng and Jean Senellart and Alexander M. Rush},
591
+ title = {OpenNMT: Open-Source Toolkit for Neural Machine Translation},
592
+ booktitle = {Proc. ACL},
593
+ year = {2017},
594
+ url = {https://doi.org/10.18653/v1/P17-4012},
595
+ doi = {10.18653/v1/P17-4012}
596
+ }
597
+
598
+ @article{paszke2017automatic,
599
+ title={Automatic differentiation in PyTorch},
600
+ author={Paszke, Adam and Gross, Sam and Chintala, Soumith and Chanan, Gregory and Yang, Edward and DeVito, Zachary and Lin, Zeming and Desmaison, Alban and Antiga, Luca and Lerer, Adam},
601
+ year={2017}
602
+ }
603
+
604
+ @article{yu2014introduction,
605
+ title={An introduction to computational networks and the computational network toolkit},
606
+ author={Yu, Dong and Eversole, Adam and Seltzer, Mike and Yao, Kaisheng and Huang, Zhiheng and Guenter, Brian and Kuchaiev, Oleksii and Zhang, Yu and Seide, Frank and Wang, Huaming and others},
607
+ journal={Microsoft Technical Report MSR-TR-2014--112},
608
+ year={2014}
609
+ }
610
+
611
+ @article{nvidia2017v100,
612
+ title={V100 GPU architecture. The world’s most advanced data center GPU. Version WP-08608-001\_v1.1},
613
+ author={NVIDIA, Tesla},
614
+ journal={NVIDIA. Aug},
615
+ pages={108},
616
+ year={2017}
617
+ }
618
+
619
+ @article{Ba2016LayerNorm,
620
+ author = {Jimmy Lei Ba and Jamie Ryan Kiros and Geoffrey E Hinton},
621
+ title = {Layer normalization},
622
+ journal = {CoRR},
623
+ volume = {abs/1607.06450},
624
+ year = {2016},
625
+ url = {http://arxiv.org/abs/1607.06450},
626
+ archivePrefix = {arXiv},
627
+ }
628
+
629
+ @inproceedings{Dauphin2017GLU,
630
+ author = {Dauphin, Yann N. and Fan, Angela and Auli, Michael and Grangier, David},
631
+ title = {Language Modeling with Gated Convolutional Networks},
632
+ booktitle = {Proceedings of the 34th International Conference on Machine Learning - Volume 70},
633
+ series = {ICML'17},
634
+ year = {2017},
635
+ location = {Sydney, NSW, Australia},
636
+ pages = {933--941},
637
+ numpages = {9},
638
+ url = {http://dl.acm.org/citation.cfm?id=3305381.3305478},
639
+ acmid = {3305478},
640
+ publisher = {JMLR.org},
641
+ }
642
+
643
+ @incollection{Oord2016PixelCNN,
644
+ title = {Conditional Image Generation with PixelCNN Decoders},
645
+ author = {van den Oord, Aaron and Kalchbrenner, Nal and Espeholt, Lasse and Kavukcuoglu, Koray and Vinyals, Oriol and Graves, Alex},
646
+ booktitle = {Advances in Neural Information Processing Systems 29},
647
+ editor = {D. D. Lee and M. Sugiyama and U. V. Luxburg and I. Guyon and R. Garnett},
648
+ pages = {4790--4798},
649
+ year = {2016},
650
+ publisher = {Curran Associates, Inc.},
651
+ url = {http://papers.nips.cc/paper/6527-conditional-image-generation-with-pixelcnn-decoders.pdf}
652
+ }
653
+
654
+ @article{he2015,
655
+ title={Deep residual learning for image recognition},
656
+ author={K. He and X. Zhang and S. Ren and J. Sun},
657
+ journal={arXiv preprint arXiv:1512.03385},
658
+ year={2015}
659
+ }
660
+
661
+ @article{huang2016,
662
+ title={Densely Connected Convolutional Networks},
663
+ author={Gao Huang and Zhuang Liu and Laurens van der Maaten and Kilian Q. Weinberger},
664
+ journal={arXiv preprint arXiv:1608.06993},
665
+ year={2016}
666
+ }
667
+
668
+ @inproceedings{heafield2011kenlm,
669
+ title={KenLM: Faster and smaller language model queries},
670
+ author={Heafield, Kenneth},
671
+ booktitle={Proceedings of the sixth workshop on statistical machine translation},
672
+ pages={187--197},
673
+ year={2011},
674
+ organization={Association for Computational Linguistics}
675
+ }
676
+
677
+ @article{dai2018transformer,
678
+ title={Transformer-XL: Language Modeling with Longer-Term Dependency},
679
+ author={Dai, Zihang and Yang, Zhilin and Yang, Yiming and Cohen, William W and Carbonell, Jaime and Le, Quoc V and Salakhutdinov, Ruslan},
680
+ year={2018},
681
+ journal = {CoRR},
682
+ volume = {abs/1901.02860},
683
+ url = {http://arxiv.org/abs/1901.02860},
684
+ archivePrefix = {arXiv},
685
+ eprint = {1901.02860},
686
+ timestamp = {Fri, 01 Feb 2019 13:39:59 +0100},
687
+ biburl = {https://dblp.org/rec/bib/journals/corr/abs-1901-02860},
688
+ bibsource = {dblp computer science bibliography, https://dblp.org}
689
+ }
690
+
691
+ @inproceedings{Saon+2016,
692
+ author={George Saon and Tom Sercu and Steven Rennie and Hong-Kwang J. Kuo},
693
+ title={The IBM 2016 English Conversational Telephone Speech Recognition System},
694
+ year=2016,
695
+ booktitle={Interspeech 2016},
696
+ doi={10.21437/Interspeech.2016-1460},
697
+ url={http://dx.doi.org/10.21437/Interspeech.2016-1460},
698
+ pages={7--11}
699
+ }
700
+
701
+ @INPROCEEDINGS{Sercu-2016,
702
+ author={T. {Sercu} and C. {Puhrsch} and B. {Kingsbury} and Y. {LeCun}},
703
+ booktitle={2016 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
704
+ title={Very deep multilingual convolutional neural networks for LVCSR},
705
+ year={2016},
706
+ volume={},
707
+ number={},
708
+ pages={4955--4959},
709
+ keywords={natural language processing;neural nets;speech recognition;very deep multilingual convolutional neural networks;LVCSR;CNN;large vocabulary continuous speech recognition systems;word error rate;Training;Context;Hidden Markov models;Neural networks;Computer architecture;Kernel;Training data;Convolutional Networks;Multilingual;Acoustic Modeling;Speech Recognition;Neural Networks},
710
+ doi={10.1109/ICASSP.2016.7472620},
711
+ ISSN={2379-190X},
712
+ month={March},}
713
+
714
+
715
+ @inproceedings{Sercu+2016,
716
+ author={Tom Sercu and Vaibhava Goel},
717
+ title={Advances in Very Deep Convolutional Neural Networks for LVCSR},
718
+ year=2016,
719
+ booktitle={Interspeech 2016},
720
+ doi={10.21437/Interspeech.2016-1033},
721
+ url={http://dx.doi.org/10.21437/Interspeech.2016-1033},
722
+ pages={3429--3433}
723
+ }
724
+
725
+ @INPROCEEDINGS{Xiong-2018,
726
+ author={W. {Xiong} and L. {Wu} and F. {Alleva} and J. {Droppo} and X. {Huang} and A. {Stolcke}},
727
+ booktitle={2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
728
+ title={The Microsoft 2017 Conversational Speech Recognition System},
729
+ year={2018},
730
+ volume={},
731
+ number={},
732
+ pages={5934--5938},
733
+ keywords={convolution;feedforward neural nets;natural language processing;speaker recognition;speech processing;language model rescoring step;senone level;switchboard domains;character-based LSTM language models;NIST 2000 switchboard test set;frame level;word-level voting;acoustic model posteriors;dialog session aware LSTM language models;CNN-BLSTM acoustic model;Microsoft 2017 conversational speech recognition system;Acoustics;Error analysis;Training;Speech recognition;Switches;Computational modeling;Context modeling;Conversational speech recognition;CNN;LACE;BLSTM;LSTM-LM;system combination;human parity},
734
+ doi={10.1109/ICASSP.2018.8461870},
735
+ ISSN={2379-190X},
736
+ month={April},}
737
+
738
+ @inproceedings{zeyer2018improved,
739
+ author={Albert Zeyer and Kazuki Irie and Ralf Schlüter and Hermann Ney},
740
+ title={Improved Training of End-to-end Attention Models for Speech Recognition},
741
+ year=2018,
742
+ booktitle={Proc. Interspeech 2018},
743
+ pages={7--11},
744
+ doi={10.21437/Interspeech.2018-1616},
745
+ url={http://dx.doi.org/10.21437/Interspeech.2018-1616}
746
+ }
747
+
748
+ @article{Wav2LetterV2,
749
+ author = {Vitaliy Liptchinsky and
750
+ Gabriel Synnaeve and
751
+ Ronan Collobert},
752
+ title = {Letter-Based Speech Recognition with Gated ConvNets},
753
+ journal = {CoRR},
754
+ volume = {abs/1712.09444},
755
+ year = {2017},
756
+ url = {http://arxiv.org/abs/1712.09444},
757
+ archivePrefix = {arXiv},
758
+ eprint = {1712.09444},
759
+ timestamp = {Mon, 13 Aug 2018 16:46:33 +0200},
760
+ biburl = {https://dblp.org/rec/bib/journals/corr/abs-1712-09444},
761
+ bibsource = {dblp computer science bibliography, https://dblp.org}
762
+ }
763
+
764
+ @article{zeghidour2018,
765
+ author = {Neil Zeghidour and
766
+ Qiantong Xu and
767
+ Vitaliy Liptchinsky and
768
+ Nicolas Usunier and
769
+ Gabriel Synnaeve and
770
+ Ronan Collobert},
771
+ title = {Fully Convolutional Speech Recognition},
772
+ journal = {CoRR},
773
+ volume = {abs/1812.06864},
774
+ year = {2018},
775
+ url = {http://arxiv.org/abs/1812.06864},
776
+ archivePrefix = {arXiv},
777
+ eprint = {1812.06864},
778
+ timestamp = {Tue, 01 Jan 2019 15:01:25 +0100},
779
+ biburl = {https://dblp.org/rec/bib/journals/corr/abs-1812-06864},
780
+ bibsource = {dblp computer science bibliography, https://dblp.org}
781
+ }
782
+
783
+ @inproceedings{Hadian2018,
784
+ author={Hossein Hadian and Hossein Sameti and Daniel Povey and Sanjeev Khudanpur},
785
+ title={End-to-end Speech Recognition Using Lattice-free MMI},
786
+ year=2018,
787
+ booktitle={Proc. Interspeech 2018},
788
+ pages={12--16},
789
+ doi={10.21437/Interspeech.2018-1423},
790
+ url={http://dx.doi.org/10.21437/Interspeech.2018-1423}
791
+ }
792
+
793
+ @inproceedings{Tang2018,
794
+ author={Jian Tang and Yan Song and Lirong Dai and Ian McLoughlin},
795
+ title={Acoustic Modeling with Densely Connected Residual Network for Multichannel Speech Recognition},
796
+ year=2018,
797
+ booktitle={Proc. Interspeech 2018},
798
+ pages={1783--1787},
799
+ doi={10.21437/Interspeech.2018-1089},
800
+ url={http://dx.doi.org/10.21437/Interspeech.2018-1089}
801
+ }
802
+
803
+ @article{Kurata2017LanguageMW,
804
+ title={Language modeling with highway LSTM},
805
+ author={Gakuto Kurata and Bhuvana Ramabhadran and George Saon and Abhinav Sethy},
806
+ journal={2017 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)},
807
+ year={2017},
808
+ pages={244--251}
809
+ }
810
+
811
+ @inproceedings{Saon2017,
812
+ author={George Saon and Gakuto Kurata and Tom Sercu and Kartik Audhkhasi and Samuel Thomas and Dimitrios Dimitriadis and Xiaodong Cui and Bhuvana Ramabhadran and Michael Picheny and Lynn-Li Lim and Bergul Roomi and Phil Hall},
813
+ title={English Conversational Telephone Speech Recognition by Humans and Machines},
814
+ year=2017,
815
+ booktitle={Proc. Interspeech 2017},
816
+ pages={132--136},
817
+ doi={10.21437/Interspeech.2017-405},
818
+ url={http://dx.doi.org/10.21437/Interspeech.2017-405}
819
+ }
820
+
821
+ @inproceedings{Povey+2016,
822
+ author={Daniel Povey and Vijayaditya Peddinti and Daniel Galvez and Pegah Ghahremani and Vimal Manohar and Xingyu Na and Yiming Wang and Sanjeev Khudanpur},
823
+ title={Purely Sequence-Trained Neural Networks for ASR Based on Lattice-Free MMI},
824
+ year=2016,
825
+ booktitle={Interspeech 2016},
826
+ doi={10.21437/Interspeech.2016-595},
827
+ url={http://dx.doi.org/10.21437/Interspeech.2016-595},
828
+ pages={2751--2755}
829
+ }
830
+
831
+ @article{Yang2018,
832
+ author = {Xuerui Yang and
833
+ Jiwei Li and
834
+ Xi Zhou},
835
+ title = {A novel pyramidal-FSMN architecture with lattice-free {MMI} for speech
836
+ recognition},
837
+ journal = {CoRR},
838
+ volume = {abs/1810.11352},
839
+ year = {2018},
840
+ url = {http://arxiv.org/abs/1810.11352},
841
+ archivePrefix = {arXiv},
842
+ eprint = {1810.11352},
843
+ timestamp = {Wed, 31 Oct 2018 14:24:29 +0100},
844
+ biburl = {https://dblp.org/rec/bib/journals/corr/abs-1810-11352},
845
+ bibsource = {dblp computer science bibliography, https://dblp.org}
846
+ }
847
+
848
+ @article{liptchinsky2017based,
849
+ title={Letter-Based Speech Recognition with Gated ConvNets},
850
+ author={Liptchinsky, Vitaliy and Synnaeve, Gabriel and Collobert, Ronan},
851
+ journal={arXiv preprint arXiv:1712.09444},
852
+ year={2017}
853
+ }
854
+
855
+ @inproceedings{Weng2018,
856
+ author={Chao Weng and Jia Cui and Guangsen Wang and Jun Wang and Chengzhu Yu and Dan Su and Dong Yu},
857
+ title={Improving Attention Based Sequence-to-Sequence Models for End-to-End English Conversational Speech Recognition},
858
+ year=2018,
859
+ booktitle={Proc. Interspeech 2018},
860
+ pages={761--765},
861
+ doi={10.21437/Interspeech.2018-1030},
862
+ url={http://dx.doi.org/10.21437/Interspeech.2018-1030}
863
+ }
864
+
865
+ @INPROCEEDINGS{Battenberg2017,
866
+ author={E. {Battenberg} and J. {Chen} and R. {Child} and A. {Coates} and Y. G. Y. {Li} and H. {Liu} and S. {Satheesh} and A. {Sriram} and Z. {Zhu}},
867
+ booktitle={2017 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)},
868
+ title={Exploring neural transducers for end-to-end speech recognition},
869
+ year={2017},
870
+ volume={},
871
+ number={},
872
+ pages={206--213},
873
+ keywords={recurrent neural nets;speech recognition;Hub500 benchmark;CTC models;speech recognition pipeline;RNN-Transducer models;language model;Seq2Seq models;end-to-end speech recognition;neural transducers;Decoding;Hidden Markov models;Transducers;Task analysis;Speech;Mathematical model;Neural networks},
874
+ doi={10.1109/ASRU.2017.8268937},
875
+ ISSN={},
876
+ month={Dec},
877
+ }
878
+
879
+ @inproceedings{
880
+ loshchilov2018,
881
+ title={Decoupled Weight Decay Regularization},
882
+ author={Ilya Loshchilov and Frank Hutter},
883
+ booktitle={International Conference on Learning Representations},
884
+ year={2019},
885
+ url={https://openreview.net/forum?id=Bkg6RiCqY7},
886
+ }
887
+
888
+ @article{zhang2017ndadam,
889
+ author = {Zijun Zhang and Lin Ma and Zongpeng Li and Chuan Wu},
890
+ title = {Normalized Direction-preserving Adam},
891
+ journal = {arXiv e-prints arXiv:1709.04546},
892
+ year = {2017},
893
+ }
894
+
895
+ @article{park2019,
896
+ author = {{Park}, Daniel S. and {Chan}, William and {Zhang}, Yu and
897
+ {Chiu}, Chung-Cheng and {Zoph}, Barret and {Cubuk}, Ekin D. and
898
+ {Le}, Quoc V.},
899
+ title = "{SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition}",
900
+ journal = {arXiv e-prints},
901
+ year = "2019",
902
+ eid = {arXiv:1904.08779},
903
+ eprint = {1904.08779},
904
+ }
905
+
906
+ @article{novograd2019,
907
+ author = {{Ginsburg}, Boris and {Castonguay}, Patrice and {Hrinchuk}, Oleksii and
908
+ {Kuchaiev}, Oleksii and {Lavrukhin}, Vitaly and {Leary}, Ryan and
909
+ {Li}, Jason and {Nguyen}, Huyen and {Cohen}, Jonathan M.},
910
+ title = "{Stochastic Gradient Methods with Layer-wise Adaptive Moments for Training of Deep Networks}",
911
+ journal = {arXiv e-prints},
912
+ year = "2019",
913
+ eid = {arXiv:1905.11286},
914
+ eprint = {1905.11286},
915
+ }
916
+
917
+ @article{kriman2019quartznet,
918
+ title={Quartznet: {Deep} automatic speech recognition with 1d time-channel separable convolutions},
919
+ author={Kriman, Samuel and Beliaev, Stanislav and Ginsburg, Boris and Huang, Jocelyn and Kuchaiev, Oleksii and Lavrukhin, Vitaly and Leary, Ryan and Li, Jason and Zhang, Yang},
920
+ journal={arXiv preprint arXiv:1910.10261},
921
+ year={2019}
922
+ }
923
+
924
+ @misc{itu1988g711,
925
+ title={{ITU-T} {G.711} - {Pulse} code modulation ({PCM}) of voice frequencies},
926
+ author={ITU-T Geneva Switzerland},
927
+ year={1988},
928
+ }
929
+
930
+ @article{han2020contextnet,
931
+ title={ContextNet: Improving convolutional neural networks for automatic speech recognition with global context},
932
+ author={Han, Wei and Zhang, Zhengdong and Zhang, Yu and Yu, Jiahui and Chiu, Chung-Cheng and Qin, James and Gulati, Anmol and Pang, Ruoming and Wu, Yonghui},
933
+ journal={arXiv preprint arXiv:2005.03191},
934
+ year={2020}
935
+ }
936
+
937
+ @inproceedings{hu2018squeeze,
938
+ title={Squeeze-and-excitation networks},
939
+ author={Hu, Jie and Shen, Li and Sun, Gang},
940
+ booktitle={CVPR},
941
+ year={2018}
942
+ }
943
+
944
+ @article{koluguri2020speakernet,
945
+ title={SpeakerNet: 1D Depth-wise Separable Convolutional Network for Text-Independent Speaker Recognition and Verification},
946
+ author={Koluguri, Nithin Rao and Li, Jason and Lavrukhin, Vitaly and Ginsburg, Boris},
947
+ journal={arXiv preprint arXiv:2010.12653},
948
+ year={2020}
949
+ }
950
+
951
+ @article{gulati2020conformer,
952
+ title={Conformer: Convolution-augmented transformer for speech recognition},
953
+ author={Gulati, Anmol and Qin, James and Chiu, Chung-Cheng and Parmar, Niki and Zhang, Yu and Yu, Jiahui and Han, Wei and Wang, Shibo and Zhang, Zhengdong and Wu, Yonghui and others},
954
+ journal={arXiv preprint arXiv:2005.08100},
955
+ year={2020}
956
+ }
957
+
958
+ @article{koluguri2021titanet,
959
+ title={TitaNet: Neural Model for speaker representation with 1D Depth-wise separable convolutions and global context},
960
+ author={Koluguri, Nithin Rao and Park, Taejin and Ginsburg, Boris},
961
+ journal={arXiv preprint arXiv:2110.04410},
962
+ year={2021}
963
+ }
964
+
965
+ @article{Dawalatabad_2021,
966
+ title={ECAPA-TDNN Embeddings for Speaker Diarization},
967
+ url={http://dx.doi.org/10.21437/Interspeech.2021-941},
968
+ DOI={10.21437/interspeech.2021-941},
969
+ journal={Interspeech 2021},
970
+ publisher={ISCA},
971
+ author={Dawalatabad, Nauman and Ravanelli, Mirco and Grondin, François and Thienpondt, Jenthe and Desplanques, Brecht and Na, Hwidong},
972
+ year={2021},
973
+ month={Aug}
974
+ }
975
+
976
+ @article{park2022multi,
977
+ title = {Multi-scale Speaker Diarization with Dynamic Scale Weighting},
978
+ author = {Park, Tae Jin and Koluguri, Nithin Rao and Balam, Jagadeesh and Ginsburg, Boris},
979
+ journal = {arXiv preprint arXiv:2203.15974},
980
+ year = {2022}
981
+ }
982
+
983
+
984
+ @inproceedings{he2019streaming,
985
+ title={Streaming end-to-end speech recognition for mobile devices},
986
+ author={He, Yanzhang and Sainath, Tara N and Prabhavalkar, Rohit and McGraw, Ian and Alvarez, Raziel and Zhao, Ding and Rybach, David and Kannan, Anjuli and Wu, Yonghui and Pang, Ruoming and others},
987
+ booktitle={ICASSP 2019-2019 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
988
+ pages={6381--6385},
989
+ year={2019},
990
+ organization={IEEE}
991
+ }
992
+
993
+ @misc{wav2vec2,
994
+ doi = {10.48550/ARXIV.2006.11477},
995
+ url = {https://arxiv.org/abs/2006.11477},
996
+ author = {Baevski, Alexei and Zhou, Henry and Mohamed, Abdelrahman and Auli, Michael},
997
+ title = {wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations},
998
+ publisher = {arXiv},
999
+ year = {2020},
1000
+ copyright = {arXiv.org perpetual, non-exclusive license}
1001
+ }
1002
+
1003
+ @misc{w2v_bert,
1004
+ doi = {10.48550/ARXIV.2108.06209},
1005
+ url = {https://arxiv.org/abs/2108.06209},
1006
+ author = {Chung, Yu-An and Zhang, Yu and Han, Wei and Chiu, Chung-Cheng and Qin, James and Pang, Ruoming and Wu, Yonghui},
1007
+ title = {W2v-BERT: Combining Contrastive Learning and Masked Language Modeling for Self-Supervised Speech Pre-Training},
1008
+ publisher = {arXiv},
1009
+ year = {2021},
1010
+ copyright = {arXiv.org perpetual, non-exclusive license}
1011
+ }
1012
+
1013
+ @misc{ssl_inter,
1014
+ doi = {10.48550/ARXIV.2112.08778},
1015
+ url = {https://arxiv.org/abs/2112.08778},
1016
+ author = {Wang, Chengyi and Wu, Yu and Chen, Sanyuan and Liu, Shujie and Li, Jinyu and Qian, Yao and Yang, Zhenglu},
1017
+ title = {Self-Supervised Learning for speech recognition with Intermediate layer supervision},
1018
+ publisher = {arXiv},
1019
+ year = {2021},
1020
+ copyright = {arXiv.org perpetual, non-exclusive license}
1021
+ }
1022
+
1023
+ @misc{kim2022squeezeformer,
1024
+ doi = {10.48550/ARXIV.2206.00888},
1025
+ url = {https://arxiv.org/abs/2206.00888},
1026
+ author = {Kim, Sehoon and Gholami, Amir and Shaw, Albert and Lee, Nicholas and Mangalam, Karttikeya and Malik, Jitendra and Mahoney, Michael W. and Keutzer, Kurt},
1027
+ keywords = {Audio and Speech Processing (eess.AS), Computation and Language (cs.CL), Sound (cs.SD), FOS: Electrical engineering, electronic engineering, information engineering, FOS: Electrical engineering, electronic engineering, information engineering, FOS: Computer and information sciences, FOS: Computer and information sciences},
1028
+ title = {Squeezeformer: An Efficient Transformer for Automatic Speech Recognition},
1029
+ publisher = {arXiv},
1030
+ year = {2022},
1031
+ copyright = {arXiv.org perpetual, non-exclusive license}
1032
+ }
1033
+
1034
+ @misc{park2022multi,
1035
+ doi = {10.48550/ARXIV.2203.15974},
1036
+ url = {https://arxiv.org/abs/2203.15974},
1037
+ author = {Park, Tae Jin and Koluguri, Nithin Rao and Balam, Jagadeesh and Ginsburg, Boris},
1038
+ keywords = {Audio and Speech Processing (eess.AS), Computation and Language (cs.CL), FOS: Electrical engineering, electronic engineering, information engineering, FOS: Electrical engineering, electronic engineering, information engineering, FOS: Computer and information sciences, FOS: Computer and information sciences},
1039
+ title = {Multi-scale Speaker Diarization with Dynamic Scale Weighting},
1040
+ publisher = {arXiv},
1041
+ year = {2022},
1042
+ copyright = {Creative Commons Attribution 4.0 International}
1043
+ }
docs/source/asr/asr_language_modeling.rst ADDED
@@ -0,0 +1,395 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #####################
2
+ ASR Language Modeling
3
+ #####################
4
+
5
+ Language models have been shown to improve the accuracy of ASR models. NeMo supports the following two approaches to incorporate language models into the ASR models:
6
+
7
+ * :ref:`ngram_modeling`
8
+ * :ref:`neural_rescoring`
9
+
10
+ It is possible to use both approaches on the same ASR model.
11
+
12
+
13
+ .. _ngram_modeling:
14
+
15
+ ************************
16
+ N-gram Language Modeling
17
+ ************************
18
+
19
+ In this approach, an N-gram LM is trained on text data, then it is used in fusion with beam search decoding to find the
20
+ best candidates. The beam search decoders in NeMo support language models trained with KenLM library (
21
+ `https://github.com/kpu/kenlm <https://github.com/kpu/kenlm>`__).
22
+ The beam search decoders and KenLM library are not installed by default in NeMo, and you need to install them to be
23
+ able to use beam search decoding and N-gram LM.
24
+ Please refer to `scripts/asr_language_modeling/ngram_lm/install_beamsearch_decoders.sh` on how to install them.
25
+
26
+ NeMo supports both character-based and BPE-based models for N-gram LMs. An N-gram LM can be used with beam search
27
+ decoders on top of the ASR models to produce more accurate candidates. The beam search decoder would incorporate
28
+ the scores produced by the N-gram LM into its score calculations as the following:
29
+
30
+ .. code-block::
31
+
32
+ final_score = acoustic_score + beam_alpha*lm_score + beam_beta*seq_length
33
+
34
+ where acoustic_score is the score predicted by the acoustic encoder and lm_score is the one estimated by the LM.
35
+ Parameter 'beam_alpha' specifies the amount of importance to place on the N-gram language model, and 'beam_beta' is a
36
+ penalty term to consider the sequence length in the scores. Larger alpha means more importance on the LM and less
37
+ importance on the acoustic model. Negative values for beta will penalize longer sequences and make the decoder
38
+ prefer shorter predictions, while positive values would result in longer candidates.
39
+
40
+
41
+ Train N-gram LM
42
+ ===============
43
+
44
+ The script to train an N-gram language model with KenLM can be found at
45
+ `scripts/asr_language_modeling/ngram_lm/train_kenlm.py <https://github.com/NVIDIA/NeMo/blob/stable/scripts/asr_language_modeling/ngram_lm/train_kenlm.py>`__.
46
+
47
+ This script would train an N-gram language model with KenLM library which can be used with the beam search decoders
48
+ on top of the ASR models. This script supports both character-level and BPE-level encodings and models; the level is
49
+ detected automatically from the type of the model.
50
+
51
+
52
+ You may train the N-gram model as the following:
53
+
54
+ .. code-block::
55
+
56
+ python train_kenlm.py --nemo_model_file <path to the .nemo file of the model> \
57
+ --train_file <path to the training text or JSON manifest file> \
58
+ --kenlm_bin_path <path to the bin folder of KenLM library> \
59
+ --kenlm_model_file <path to store the binary KenLM model> \
60
+ --ngram_length <order of N-gram model> \
61
+ --preserve_arpa
62
+
63
+ The train file specified by `--train_file` can be a text file or JSON manifest. If the file's extension is anything
64
+ other than `.json`, it assumes that data format is plain text. For plain text format, each line should contain one
65
+ sample. For a JSON manifest file, each line needs to contain one JSON-formatted sample like this:
66
+
67
+ .. code-block::
68
+
69
+ {"audio_filepath": "/data_path/file1.wav", "text": "The transcript of the audio file."}
70
+
71
+ It just extracts the `text` field from each line to create the training text file. After the N-gram model is trained,
72
+ it is stored at the path specified by `--kenlm_model_file`.
73
+
74
+ The following is the list of the arguments for the training script:
75
+
76
+ +------------------+----------+-------------+-------------------------------------------------------------------------------------------------+
77
+ | **Argument** | **Type** | **Default** | **Description** |
78
+ +------------------+----------+-------------+-------------------------------------------------------------------------------------------------+
79
+ | nemo_model_file | str | Required | The path of the `.nemo` file of the ASR model. It is needed to extract the tokenizer. |
80
+ +------------------+----------+-------------+-------------------------------------------------------------------------------------------------+
81
+ | train_file | str | Required | Path to the training file, it can be a text file or JSON manifest. |
82
+ +------------------+----------+-------------+-------------------------------------------------------------------------------------------------+
83
+ | kenlm_model_file | str | Required | The path to store the KenLM binary model file. |
84
+ +------------------+----------+-------------+-------------------------------------------------------------------------------------------------+
85
+ | kenlm_bin_path | str | Required | The path to the bin folder of KenLM. It is a folder named `bin` under where KenLM is installed. |
86
+ +------------------+----------+-------------+-------------------------------------------------------------------------------------------------+
87
+ | ngram_length** | int | Required | Specifies order of N-gram LM. |
88
+ +------------------+----------+-------------+-------------------------------------------------------------------------------------------------+
89
+ | do_lower_case | bool | ``False`` | Whether to make the training text all lower case. |
90
+ +------------------+----------+-------------+-------------------------------------------------------------------------------------------------+
91
+ | preserve_arpa | bool | ``False`` | Whether to preserve the intermediate ARPA file after construction of the BIN file. |
92
+ +------------------+----------+-------------+-------------------------------------------------------------------------------------------------+
93
+
94
+ ** Note: It is recommended to use 6 as the order of the N-gram model for BPE-based models. Higher orders may require re-compiling KenLM to support them.
95
+
96
+ Evaluate by Beam Search Decoding and N-gram LM
97
+ ==============================================
98
+
99
+ NeMo's beam search decoders are capable of using the KenLM's N-gram models to find the best candidates.
100
+ The script to evaluate an ASR model with beam search decoding and N-gram models can be found at
101
+ `scripts/asr_language_modeling/ngram_lm/eval_beamsearch_ngram.py <https://github.com/NVIDIA/NeMo/blob/stable/scripts/asr_language_modeling/ngram_lm/eval_beamsearch_ngram.py>`__.
102
+
103
+ This script has a large number of possible argument overrides, therefore it is advised to use ``python eval_beamsearch_ngram.py --help`` to see the full list of arguments.
104
+
105
+ You may evaluate an ASR model as the following:
106
+
107
+ .. code-block::
108
+
109
+ python eval_beamsearch_ngram.py nemo_model_file=<path to the .nemo file of the model> \
110
+ input_manifest=<path to the evaluation JSON manifest file> \
111
+ kenlm_model_file=<path to the binary KenLM model> \
112
+ beam_width=[<list of the beam widths, separated with commas>] \
113
+ beam_alpha=[<list of the beam alphas, separated with commas>] \
114
+ beam_beta=[<list of the beam betas, separated with commas>] \
115
+ preds_output_folder=<optional folder to store the predictions> \
116
+ probs_cache_file=null \
117
+ decoding_mode=beamsearch_ngram \
118
+ decoding_strategy="<Beam library such as beam, pyctcdecode or flashlight>"
119
+
120
+ It can evaluate a model in the three following modes by setting the argument `--decoding_mode`:
121
+
122
+ * greedy: Just greedy decoding is done, and no beam search decoding is performed.
123
+ * beamsearch: The beam search decoding is done but without using the N-gram language model; the final results would be equivalent to setting the weight of LM (beam_alpha) to zero.
124
+ * beamsearch_ngram: The beam search decoding is done with N-gram LM.
125
+
126
+ The `beamsearch` mode would evaluate by beam search decoding without any language model.
127
+ It would report the performances in terms of Word Error Rate (WER) and Character Error Rate (CER). Moreover,
128
+ the WER/CER of the model when the best candidate is selected among the candidates is also reported as the best WER/CER.
129
+ It can be an indicator of how good the predicted candidates are.
130
+
131
+ The script would initially load the ASR model and predict the outputs of the model's encoder as log probabilities.
132
+ This part would be computed in batches on a device selected by `--device`, which can be CPU (`--device=cpu`) or a
133
+ single GPU (`--device=cuda:0`). The batch size of this part can get specified by `--acoustic_batch_size`. You may use
134
+ the largest batch size feasible to speed up the step of calculating the log probabilities. You may also use `--use_amp`
135
+ to speed up the calculation of log probabilities and make it possible to use larger sizes for `--acoustic_batch_size`.
136
+ Currently multi-GPU is not supported for calculating the log probabilities, but using `--probs_cache_file` can help.
137
+ It stores the log probabilities produced from the model's encoder into a pickle file so that next time the first step
138
+ can get skipped.
139
+
140
+ The following is the list of the important arguments for the evaluation script:
141
+
142
+ +---------------------+----------+------------------+-------------------------------------------------------------------------+
143
+ | **Argument** | **Type** | **Default** | **Description** |
144
+ +---------------------+----------+------------------+-------------------------------------------------------------------------+
145
+ | nemo_model_file | str | Required | The path of the `.nemo` file of the ASR model to extract the tokenizer. |
146
+ +---------------------+----------+------------------+-------------------------------------------------------------------------+
147
+ | input_manifest | str | Required | Path to the evaluation JSON manifest file. |
148
+ +---------------------+----------+------------------+-------------------------------------------------------------------------+
149
+ | kenlm_model_file | str | Required | The path of the KenLM binary model file to load. |
150
+ +---------------------+----------+------------------+-------------------------------------------------------------------------+
151
+ | preds_output_folder | str | None | The path to an optional folder to store the predictions. |
152
+ +---------------------+----------+------------------+-------------------------------------------------------------------------+
153
+ | probs_cache_file | str | None | The cache file for storing the outputs of the model. |
154
+ +---------------------+----------+------------------+-------------------------------------------------------------------------+
155
+ | acoustic_batch_size | int | 16 | The batch size to calculate log probabilities. |
156
+ +---------------------+----------+------------------+-------------------------------------------------------------------------+
157
+ | use_amp | bool | False | Whether to use AMP if available to calculate log probabilities. |
158
+ +---------------------+----------+------------------+-------------------------------------------------------------------------+
159
+ | device | str | cuda | The device to load the model onto to calculate log probabilities. |
160
+ | | | | It can be `cpu`, `cuda`, `cuda:0`, `cuda:1`, ... |
161
+ +---------------------+----------+------------------+-------------------------------------------------------------------------+
162
+ | decoding_mode | str | beamsearch_ngram | The decoding scheme to be used for evaluation. |
163
+ +---------------------+----------+------------------+-------------------------------------------------------------------------+
164
+ | beam_width | float | Required | The width or list of the widths of the beam search decoding. |
165
+ +---------------------+----------+------------------+-------------------------------------------------------------------------+
166
+ | beam_alpha | float | Required | List of the alpha parameter for the beam search decoding. |
167
+ +---------------------+----------+------------------+-------------------------------------------------------------------------+
168
+ | beam_beta | float | Required | List of the beta parameter for the beam search decoding. |
169
+ +---------------------+----------+------------------+-------------------------------------------------------------------------+
170
+ | beam_batch_size | int | 128 | The batch size to be used for beam search decoding. |
171
+ | | | | Larger batch size can be a little faster, but uses larger memory. |
172
+ +---------------------+----------+------------------+-------------------------------------------------------------------------+
173
+ | decoding_strategy | str | beam | String argument for type of decoding strategy for the model. |
174
+ +---------------------+----------+------------------+-------------------------------------------------------------------------+
175
+ | decoding | Dict | BeamCTC | Subdict of beam search configs. Values found via |
176
+ | | Config | InferConfig | python eval_beamsearch_ngram.py --help |
177
+ +---------------------+----------+------------------+-------------------------------------------------------------------------+
178
+
179
+ Width of the beam search (`--beam_width`) specifies the number of top candidates/predictions the beam search decoder
180
+ would search for. Larger beams result in more accurate but slower predictions.
181
+
182
+ .. note::
183
+
184
+ The ``eval_beamsearch_ngram.py`` script contains the entire subconfig used for CTC Beam Decoding.
185
+ Therefore it is possible to forward arguments for various beam search libraries such as ``flashlight``
186
+ and ``pyctcdecode`` via the ``decoding`` subconfig.
187
+
188
+ There is also a tutorial to learn more about evaluating the ASR models with N-gram LM here:
189
+ `Offline ASR Inference with Beam Search and External Language Model Rescoring <https://colab.research.google.com/github/NVIDIA/NeMo/blob/stable/tutorials/asr/Offline_ASR.ipynb>`_
190
+
191
+ Beam Search Engines
192
+ -------------------
193
+
194
+ NeMo ASR CTC supports multiple beam search engines for decoding. The default engine is ``beam`` which is the OpenSeq2Seq
195
+ decoding library.
196
+
197
+ OpenSeq2Seq (``beam``)
198
+ ~~~~~~~~~~~~~~~~~~~~~~
199
+
200
+ CPU-based beam search engine that is quite efficient and supports char and subword models. It requires a character/subword
201
+ KenLM model to be provided.
202
+
203
+ The config for this decoding library is described above.
204
+
205
+ Flashlight (``flashlight``)
206
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~
207
+
208
+ Flashlight is a C++ library for ASR decoding provided at `https://github.com/flashlight/flashlight <https://github.com/flashlight/flashlight>`_. It is a CPU and CUDA-based beam search engine that is quite efficient and supports
209
+ char and subword models. It requires an ARPA KenLM file.
210
+
211
+ It supports several advanced features such as lexicon based / lexicon free decoding, beam pruning threshold, and more.
212
+
213
+ .. code-block:: python
214
+
215
+ @dataclass
216
+ class FlashlightConfig:
217
+ lexicon_path: Optional[str] = None
218
+ beam_size_token: int = 16
219
+ beam_threshold: float = 20.0
220
+ unk_weight: float = -math.inf
221
+ sil_weight: float = 0.0
222
+ unit_lm: bool = False
223
+
224
+ .. code-block::
225
+
226
+ # Lexicon-based decoding
227
+ python eval_beamsearch_ngram.py ... \
228
+ decoding_strategy="flashlight" \
229
+ decoding.beam.flashlight_cfg.lexicon_path='/path/to/lexicon.lexicon' \
230
+ decoding.beam.flashlight_cfg.beam_size_token=32 \
231
+ decoding.beam.flashlight_cfg.beam_threshold=25.0
232
+
233
+ # Lexicon-free decoding
234
+ python eval_beamsearch_ngram.py ... \
235
+ decoding_strategy="flashlight" \
236
+ decoding.beam.flashlight_cfg.beam_size_token=32 \
237
+ decoding.beam.flashlight_cfg.beam_threshold=25.0
238
+
239
+
240
+ PyCTCDecode (``pyctcdecode``)
241
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
242
+
243
+ PyCTCDecode is a Python library for ASR decoding provided at `https://github.com/kensho-technologies/pyctcdecode <https://github.com/kensho-technologies/pyctcdecode>`_. It is a CPU-based beam search engine that is somewhat efficient for a pure python library, and supports char and subword models. It requires a character/subword KenLM ARPA / BINARY model to be provided.
244
+
245
+ It has advanced features such as word boosting which can be useful for transcript customization.
246
+
247
+ .. code-block:: python
248
+
249
+ @dataclass
250
+ class PyCTCDecodeConfig:
251
+ beam_prune_logp: float = -10.0
252
+ token_min_logp: float = -5.0
253
+ prune_history: bool = False
254
+ hotwords: Optional[List[str]] = None
255
+ hotword_weight: float = 10.0
256
+
257
+ .. code-block::
258
+
259
+ # PyCTCDecoding
260
+ python eval_beamsearch_ngram.py ... \
261
+ decoding_strategy="pyctcdecode" \
262
+ decoding.beam.pyctcdecode_cfg.beam_prune_logp=-10. \
263
+ decoding.beam.pyctcdecode_cfg.token_min_logp=-5. \
264
+ decoding.beam.pyctcdecode_cfg.hotwords=[<List of str words>] \
265
+ decoding.beam.pyctcdecode_cfg.hotword_weight=10.0
266
+
267
+
268
+ Hyperparameter Grid Search
269
+ --------------------------
270
+
271
+ Beam search decoding with N-gram LM has three main hyperparameters: `beam_width`, `beam_alpha`, and `beam_beta`.
272
+ The accuracy of the model depends on the values of these parameters, especially beam_alpha and beam_beta.
273
+ You may specify a single or list of values for each of these parameters to perform grid search. It would perform the
274
+ beam search decoding on all the combinations of these three hyperparameters.
275
+ For instance, the following set of parameters would result in 2*1*2=4 beam search decodings:
276
+
277
+ .. code-block::
278
+
279
+ python eval_beamsearch_ngram.py ... \
280
+ beam_width=[64,128] \
281
+ beam_alpha=[1.0] \
282
+ beam_beta=[1.0,0.5]
283
+
284
+
285
+ .. _neural_rescoring:
286
+
287
+ ****************
288
+ Neural Rescoring
289
+ ****************
290
+
291
+ In this approach a neural network is used which can give a score to a candidate. A candidate is the text transcript predicted by the decoder of the ASR model.
292
+ The top K candidates produced by the beam search decoding (beam width of K) are given to a neural language model to rank them.
293
+ Ranking can be done by a language model which gives a score to each candidate.
294
+ This score is usually combined with the scores from the beam search decoding to produce the final scores and rankings.
295
+
296
+ Train Neural Rescorer
297
+ =====================
298
+
299
+ An example script to train such a language model with Transformer can be found at `examples/nlp/language_modeling/transformer_lm.py <https://github.com/NVIDIA/NeMo/blob/stable/examples/nlp/language_modeling/transformer_lm.py>`__.
300
+ It trains a ``TransformerLMModel`` which can be used as a neural rescorer for an ASR system. Full documentation on language models training is available at:
301
+
302
+ :doc:`../nlp/language_modeling`
303
+
304
+ You may also use a pretrained language model from HuggingFace library like Transformer-XL and GPT instead of training your model.
305
+ Models like BERT and RoBERTa are not supported by this script as they are trained as a Masked Language Model and are not efficient and effective to score sentences out of the box.
306
+
307
+
308
+ Evaluation
309
+ ==========
310
+
311
+ Given a trained TransformerLMModel `.nemo` file or a pretrained HF model, the script available at
312
+ `scripts/asr_language_modeling/neural_rescorer/eval_neural_rescorer.py <https://github.com/NVIDIA/NeMo/blob/stable/scripts/asr_language_modeling/neural_rescorer/eval_neural_rescorer.py>`__
313
+ can be used to re-score beams obtained with ASR model. You need the `.tsv` file containing the candidates produced
314
+ by the acoustic model and the beam search decoding to use this script. The candidates can be the result of just the beam
315
+ search decoding or the result of fusion with an N-gram LM. You may generate this file by specifying `--preds_output_folder` for
316
+ `scripts/asr_language_modeling/ngram_lm/eval_beamsearch_ngram.py <https://github.com/NVIDIA/NeMo/blob/stable/scripts/asr_language_modeling/ngram_lm/eval_beamsearch_ngram.py>`__.
317
+
318
+ The neural rescorer would rescore the beams/candidates by using two parameters of `rescorer_alpha` and `rescorer_beta` as the following:
319
+
320
+ .. code-block::
321
+
322
+ final_score = beam_search_score + rescorer_alpha*neural_rescorer_score + rescorer_beta*seq_length
323
+
324
+ Parameter `rescorer_alpha` specifies amount of importance to place on the neural rescorer model, and `rescorer_beta` is
325
+ a penalty term to consider the sequence length in the scores. Their effects are similar to those of the parameters
326
+ `beam_alpha` and `beam_beta` of beam search decoder and N-gram LM.
327
+
328
+ You may follow the following steps to evaluate a neural LM:
329
+
330
+ #. Obtain `.tsv` file with beams and their corresponding scores. Scores can be from a regular beam search decoder or
331
+ in fusion with an N-gram LM scores. For a given beam size `beam_size` and a number of examples
332
+ for evaluation `num_eval_examples`, it should contain (`num_eval_examples` x `beam_size`) lines of
333
+ form `beam_candidate_text \t score`. This file can be generated by `scripts/asr_language_modeling/ngram_lm/eval_beamsearch_ngram.py <https://github.com/NVIDIA/NeMo/blob/stable/scripts/asr_language_modeling/ngram_lm/eval_beamsearch_ngram.py>`__
334
+
335
+ #. Rescore the candidates by `scripts/asr_language_modeling/neural_rescorer/eval_neural_rescorer.py <https://github.com/NVIDIA/NeMo/blob/stable/scripts/asr_language_modeling/neural_rescorer/eval_neural_rescorer.py>`__.
336
+
337
+ .. code-block::
338
+
339
+ python eval_neural_rescorer.py \
340
+ --lm_model=[path to .nemo file of the LM or the name of a HF pretrained model] \
341
+ --beams_file=[path to beams .tsv file] \
342
+ --beam_size=[size of the beams] \
343
+ --eval_manifest=[path to eval manifest .json file] \
344
+ --batch_size=[batch size used for inference on the LM model] \
345
+ --alpha=[the value for the parameter rescorer_alpha] \
346
+ --beta=[the value for the parameter rescorer_beta] \
347
+ --scores_output_file=[the optional path to store the rescored candidates]
348
+
349
+ The candidates along with their new scores would be stored at the file specified by `--scores_output_file`.
350
+
351
+ The following is the list of the arguments for the evaluation script:
352
+
353
+ +---------------------+--------+------------------+-------------------------------------------------------------------------+
354
+ | **Argument** |**Type**| **Default** | **Description** |
355
+ +---------------------+--------+------------------+-------------------------------------------------------------------------+
356
+ | lm_model | str | Required | The path of the '.nemo' file of a language model, or the name of a |
357
+ | | | | HuggingFace pretrained model like 'transfo-xl-wt103' or 'gpt2' |
358
+ +---------------------+--------+------------------+-------------------------------------------------------------------------+
359
+ | eval_manifest | str | Required | Path to the evaluation manifest file (.json manifest file) |
360
+ +---------------------+--------+------------------+-------------------------------------------------------------------------+
361
+ | beams_file | str | Required | path to beams file (.tsv) containing the candidates and their scores |
362
+ +---------------------+--------+------------------+-------------------------------------------------------------------------+
363
+ | beam_size | int | Required | The width of the beams (number of candidates) generated by the decoder |
364
+ +---------------------+--------+------------------+-------------------------------------------------------------------------+
365
+ | alpha | float | None | The value for parameter rescorer_alpha |
366
+ | | | | Not passing value would enable linear search for rescorer_alpha |
367
+ +---------------------+--------+------------------+-------------------------------------------------------------------------+
368
+ | beta | float | None | The value for parameter rescorer_beta |
369
+ | | | | Not passing value would enable linear search for rescorer_beta |
370
+ +---------------------+--------+------------------+-------------------------------------------------------------------------+
371
+ | batch_size | int | 16 | The batch size used to calculate the scores |
372
+ +---------------------+--------+------------------+-------------------------------------------------------------------------+
373
+ | max_seq_length | int | 512 | Maximum sequence length (in tokens) for the input |
374
+ +---------------------+--------+------------------+-------------------------------------------------------------------------+
375
+ | scores_output_file | str | None | The optional file to store the rescored beams |
376
+ +---------------------+--------+------------------+-------------------------------------------------------------------------+
377
+ | use_amp | bool | ``False`` | Whether to use AMP if available to calculate the scores |
378
+ +---------------------+--------+------------------+-------------------------------------------------------------------------+
379
+ | device | str | cuda | The device to load LM model onto to calculate the scores |
380
+ | | | | It can be 'cpu', 'cuda', 'cuda:0', 'cuda:1', ... |
381
+ +---------------------+--------+------------------+-------------------------------------------------------------------------+
382
+
383
+
384
+ Hyperparameter Linear Search
385
+ ----------------------------
386
+
387
+ This script also supports linear search for parameters `alpha` and `beta`. If any of the two is not
388
+ provided, a linear search is performed to find the best value for that parameter. When linear search is used, initially
389
+ `beta` is set to zero and the best value for `alpha` is found, then `alpha` is fixed with
390
+ that value and another linear search is done to find the best value for `beta`.
391
+ If either of these two parameters is already specified, then search for that one is skipped. After each search for a
392
+ parameter, the plot of WER% for different values of the parameter is also shown.
393
+
394
+ It is recommended to first use the linear search for both parameters on a validation set by not providing any values for `--alpha` and `--beta`.
395
+ Then check the WER curves and decide on the best values for each parameter. Finally, evaluate the best values on the test set.
docs/source/asr/configs.rst ADDED
@@ -0,0 +1,929 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ NeMo ASR Configuration Files
2
+ ============================
3
+
4
+ This section describes the NeMo configuration file setup that is specific to models in the ASR collection. For general information
5
+ about how to set up and run experiments that is common to all NeMo models (e.g. Experiment Manager and PyTorch Lightning trainer
6
+ parameters), see the :doc:`../core/core` section.
7
+
8
+ The model section of the NeMo ASR configuration files generally requires information about the dataset(s) being used, the preprocessor
9
+ for audio files, parameters for any augmentation being performed, as well as the model architecture specification. The sections on
10
+ this page cover each of these in more detail.
11
+
12
+ Example configuration files for all of the NeMo ASR scripts can be found in the
13
+ `config directory of the examples <https://github.com/NVIDIA/NeMo/tree/stable/examples/asr/conf>`_.
14
+
15
+
16
+ Dataset Configuration
17
+ ---------------------
18
+
19
+ Training, validation, and test parameters are specified using the ``train_ds``, ``validation_ds``, and
20
+ ``test_ds`` sections in the configuration file, respectively. Depending on the task, there may be arguments specifying the sample rate
21
+ of the audio files, the vocabulary of the dataset (for character prediction), whether or not to shuffle the dataset, and so on. You may
22
+ also decide to leave fields such as the ``manifest_filepath`` blank, to be specified via the command-line at runtime.
23
+
24
+ Any initialization parameter that is accepted for the Dataset class used in the experiment can be set in the config file.
25
+ Refer to the `Datasets <./api.html#Datasets>`__ section of the API for a list of Datasets and their respective parameters.
26
+
27
+ An example ASR train and validation configuration should look similar to the following:
28
+
29
+ .. code-block:: yaml
30
+
31
+ # Specified at the beginning of the config file
32
+ labels: &labels [" ", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
33
+ "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "'"]
34
+
35
+ model:
36
+ train_ds:
37
+ manifest_filepath: ???
38
+ sample_rate: 16000
39
+ labels: *labels # Uses the labels above
40
+ batch_size: 32
41
+ trim_silence: True
42
+ max_duration: 16.7
43
+ shuffle: True
44
+ num_workers: 8
45
+ pin_memory: true
46
+ # tarred datasets
47
+ is_tarred: false # If set to true, uses the tarred version of the Dataset
48
+ tarred_audio_filepaths: null # Not used if is_tarred is false
49
+ shuffle_n: 2048 # Not used if is_tarred is false
50
+ # bucketing params
51
+ bucketing_strategy: "synced_randomized"
52
+ bucketing_batch_size: null
53
+ bucketing_weights: null
54
+
55
+ validation_ds:
56
+ manifest_filepath: ???
57
+ sample_rate: 16000
58
+ labels: *labels # Uses the labels above
59
+ batch_size: 32
60
+ shuffle: False # No need to shuffle the validation data
61
+ num_workers: 8
62
+ pin_memory: true
63
+
64
+ By default, dataloaders are set up when the model is instantiated. However, dataloader setup can be deferred to
65
+ the model's ``setup()`` method by setting ``defer_setup`` in the configuration.
66
+
67
+ For example, training data setup can be deferred as follows:
68
+
69
+ .. code-block:: yaml
70
+
71
+ model:
72
+ train_ds:
73
+ # Configure training data as usual
74
+ ...
75
+ # Defer train dataloader setup from `__init__` to `setup`
76
+ defer_setup: true
77
+
78
+
79
+ Preprocessor Configuration
80
+ --------------------------
81
+
82
+ If you are loading audio files for your experiment, you will likely want to use a preprocessor to convert from the
83
+ raw audio signal to features (e.g. mel-spectrogram or MFCC). The ``preprocessor`` section of the config specifies the audio
84
+ preprocessor to be used via the ``_target_`` field, as well as any initialization parameters for that preprocessor.
85
+
86
+ An example of specifying a preprocessor is as follows:
87
+
88
+ .. code-block:: yaml
89
+
90
+ model:
91
+ ...
92
+ preprocessor:
93
+ # _target_ is the audio preprocessor module you want to use
94
+ _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
95
+ normalize: "per_feature"
96
+ window_size: 0.02
97
+ ...
98
+ # Other parameters for the preprocessor
99
+
100
+ Refer to the `Audio Preprocessors <./api.html#Audio Preprocessors>`__ API section for the preprocessor options, expected arguments,
101
+ and defaults.
102
+
103
+ Augmentation Configurations
104
+ ---------------------------
105
+
106
+ There are a few on-the-fly spectrogram augmentation options for NeMo ASR, which can be specified by the
107
+ configuration file using a ``spec_augment`` section.
108
+
109
+ For example, there are options for `Cutout <https://arxiv.org/abs/1708.04552>`_ and
110
+ `SpecAugment <https://arxiv.org/abs/1904.08779>`_ available via the ``SpectrogramAugmentation`` module.
111
+
112
+ The following example sets up both ``Cutout`` (via the ``rect_*`` parameters) and ``SpecAugment`` (via the ``freq_*``
113
+ and ``time_*`` parameters).
114
+
115
+ .. code-block:: yaml
116
+
117
+ model:
118
+ ...
119
+ spec_augment:
120
+ _target_: nemo.collections.asr.modules.SpectrogramAugmentation
121
+ # Cutout parameters
122
+ rect_masks: 5 # Number of rectangles to cut from any given spectrogram
123
+ rect_freq: 50 # Max cut of size 50 along the frequency dimension
124
+ rect_time: 120 # Max cut of size 120 along the time dimension
125
+ # SpecAugment parameters
126
+ freq_masks: 2 # Cut two frequency bands
127
+ freq_width: 15 # ... of width 15 at maximum
128
+ time_masks: 5 # Cut out five time bands
129
+ time_width: 25 # ... of width 25 at maximum
130
+
131
+ You can use any combination of ``Cutout``, frequency/time ``SpecAugment``, or neither of them.
132
+
133
+ With NeMo ASR, you can also add augmentation pipelines that can be used to simulate various kinds of noise
134
+ added to audio in the channel. Augmentors in a pipeline are applied on the audio data read in the data layer. Online
135
+ augmentors can be specified in the config file using an ``augmentor`` section in ``train_ds``. The following example
136
+ adds an augmentation pipeline that first adds white noise to an audio sample with a probability of 0.5 and at a level
137
+ randomly picked between -50 dB and -10 dB and then passes the resultant samples through a room impulse response randomly
138
+ picked from the manifest file provided for ``impulse`` augmentation in the config file.
139
+
140
+ .. code-block:: yaml
141
+
142
+ model:
143
+ ...
144
+ train_ds:
145
+ ...
146
+ augmentor:
147
+ white_noise:
148
+ prob: 0.5
149
+ min_level: -50
150
+ max_level: -10
151
+ impulse:
152
+ prob: 0.3
153
+ manifest_path: /path/to/impulse_manifest.json
154
+
155
+ Refer to the `Audio Augmentors <./api.html#Audio Augmentors>`__ API section for more details.
156
+
157
+ Tokenizer Configurations
158
+ ------------------------
159
+
160
+ Some models utilize sub-word encoding via an external tokenizer instead of explicitly defining their vocabulary.
161
+
162
+ For such models, a ``tokenizer`` section is added to the model config. ASR models currently support the following types of
163
+ custom tokenizers:
164
+
165
+ - Google Sentencepiece tokenizers (tokenizer type of ``bpe`` in the config)
166
+ - HuggingFace WordPiece tokenizers (tokenizer type of ``wpe`` in the config)
167
+ - Aggregate tokenizers (tokenizer type of ``agg`` in the config; see below)
168
+
169
+ In order to build custom tokenizers, refer to the ``ASR_with_Subword_Tokenization`` notebook available in the
170
+ ASR tutorials directory.
171
+
172
+ The following example sets up a ``SentencePiece Tokenizer`` at a path specified by the user:
173
+
174
+ .. code-block:: yaml
175
+
176
+ model:
177
+ ...
178
+ tokenizer:
179
+ dir: "<path to the directory that contains the custom tokenizer files>"
180
+ type: "bpe" # can be "bpe" or "wpe"
181
+
182
+ The Aggregate (``agg``) tokenizer feature makes it possible to combine tokenizers in order to train multilingual
183
+ models. The config file would look like this:
184
+
185
+ .. code-block:: yaml
186
+
187
+ model:
188
+ ...
189
+ tokenizer:
190
+ type: "agg" # aggregate tokenizer
191
+ langs:
192
+ en:
193
+ dir: "<path to the directory that contains the tokenizer files>"
194
+ type: "bpe" # can be "bpe" or "wpe"
195
+ es:
196
+ dir: "<path to the directory that contains the tokenizer files>"
197
+ type: "bpe" # can be "bpe" or "wpe"
198
+
199
+ In the above config file, each language is associated with its own pre-trained tokenizer, which gets assigned
200
+ a token id range in the order the tokenizers are listed. To train a multilingual model, one needs to populate the
201
+ ``lang`` field in the manifest file, allowing the routing of each sample to the correct tokenizer. At inference time,
202
+ the routing is done based on the inferred token id range.
203
+
204
+ For models which utilize sub-word tokenization, we share the decoder module (``ConvASRDecoder``) with character tokenization models.
205
+ All parameters are shared, but for models which utilize sub-word encoding, there are minor differences when setting up the config. For
206
+ such models, the tokenizer is utilized to fill in the missing information when the model is constructed automatically.
207
+
208
+ For example, a decoder config corresponding to a sub-word tokenization model should look similar to the following:
209
+
210
+ .. code-block:: yaml
211
+
212
+ model:
213
+ ...
214
+ decoder:
215
+ _target_: nemo.collections.asr.modules.ConvASRDecoder
216
+ feat_in: *enc_final
217
+ num_classes: -1 # filled with vocabulary size from tokenizer at runtime
218
+ vocabulary: [] # filled with vocabulary from tokenizer at runtime
219
+
220
+
221
+ Model Architecture Configurations
222
+ ---------------------------------
223
+
224
+ Each configuration file should describe the model architecture being used for the experiment. Models in the NeMo ASR collection need
225
+ an ``encoder`` section and a ``decoder`` section, with the ``_target_`` field specifying the module to use for each.
226
+
227
+ Here is the list of the parameters in the model section which are shared among most of the ASR models:
228
+
229
+ +-------------------------+------------------+---------------------------------------------------------------------------------------------------------------+---------------------------------+
230
+ | **Parameter** | **Datatype** | **Description** | **Supported Values** |
231
+ +=========================+==================+===============================================================================================================+=================================+
232
+ | :code:`log_prediction` | bool | Whether a random sample should be printed in the output at each step, along with its predicted transcript. | |
233
+ +-------------------------+------------------+---------------------------------------------------------------------------------------------------------------+---------------------------------+
234
+ | :code:`ctc_reduction` | string | Specifies the reduction type of CTC loss. Defaults to ``mean_batch`` which would take the average over the | :code:`none`, |
235
+ | | | batch after taking the average over the length of each sample. | :code:`mean_batch` |
236
+ | | | | :code:`mean`, :code:`sum` |
237
+ +-------------------------+------------------+---------------------------------------------------------------------------------------------------------------+---------------------------------+
238
+
239
+ The following sections go into more detail about the specific configurations of each model architecture.
240
+
241
+ For more information about the ASR models, refer to the :doc:`Models <./models>` section.
242
+
243
+ Jasper and QuartzNet
244
+ ~~~~~~~~~~~~~~~~~~~~
245
+
246
+ The `Jasper <./models.html#Jasper>`__ and `QuartzNet <./models.html#QuartzNet>`__ models are very similar, and as such the components in their
247
+ configs are very similar as well.
248
+
249
+ Both architectures use the ``ConvASREncoder`` for the ``encoder``, with parameters detailed in the table below. The encoder parameters
250
+ include details about the Jasper/QuartzNet ``[BxR]`` encoder architecture, including how many blocks to use (``B``), how many times
251
+ to repeat each sub-block (``R``), and the convolution parameters for each block.
252
+
253
+ The number of blocks ``B`` is determined by the number of list elements under ``jasper`` minus the one prologue and two epilogue blocks.
254
+ The number of sub-blocks ``R`` is determined by setting the ``repeat`` parameter.
255
+
256
+ To use QuartzNet (which uses more compact time-channel separable convolutions) instead of Jasper, add :code:`separable: true` to all
257
+ but the last block in the architecture.
258
+
259
+ Note that the list of blocks is specified under the parameter name ``jasper`` for both Jasper and QuartzNet models.
260
+
261
+ +-------------------------+------------------+---------------------------------------------------------------------------------------------------------------+-------------------------------------+
262
+ | **Parameter** | **Datatype** | **Description** | **Supported Values** |
263
+ +=========================+==================+===============================================================================================================+=====================================+
264
+ | :code:`feat_in` | int | The number of input features. Should be equal to :code:`features` in the preprocessor parameters. | |
265
+ +-------------------------+------------------+---------------------------------------------------------------------------------------------------------------+-------------------------------------+
266
+ | :code:`activation` | string | Which activation function to use in the encoder. | :code:`hardtanh`, :code:`relu`, |
267
+ | | | | :code:`selu`, :code:`swish` |
268
+ +-------------------------+------------------+---------------------------------------------------------------------------------------------------------------+-------------------------------------+
269
+ | :code:`conv_mask` | bool | Whether to use masked convolutions in the encoder. Defaults to ``true``. | |
270
+ +-------------------------+------------------+---------------------------------------------------------------------------------------------------------------+-------------------------------------+
271
+ | :code:`jasper` | | A list of blocks that specifies your encoder architecture. Each entry in this list represents one block in | |
272
+ | | | the architecture and contains the parameters for that block, including convolution parameters, dropout, and | |
273
+ | | | the number of times the block is repeated. Refer to the `Jasper <https://arxiv.org/pdf/1904.03288.pdf>`_ and | |
274
+ | | | `QuartzNet <https://arxiv.org/pdf/1910.10261.pdf>`_ papers for details about specific model configurations. | |
275
+ +-------------------------+------------------+---------------------------------------------------------------------------------------------------------------+-------------------------------------+
276
+
277
+ A QuartzNet 15x5 (fifteen blocks, each sub-block repeated five times) encoder configuration should look similar to the following example:
278
+
279
+ .. code-block:: yaml
280
+
281
+ # Specified at the beginning of the file for convenience
282
+ n_mels: &n_mels 64 # Used for both the preprocessor and encoder as number of input features
283
+ repeat: &repeat 5 # R=5
284
+ dropout: &dropout 0.0
285
+ separable: &separable true # Set to true for QN. Set to false for Jasper.
286
+
287
+ model:
288
+ ...
289
+ encoder:
290
+ _target_: nemo.collections.asr.modules.ConvASREncoder
291
+ feat_in: *n_mels # Should match "features" in the preprocessor.
292
+ activation: relu
293
+ conv_mask: true
294
+
295
+ jasper: # This field name should be "jasper" for both types of models.
296
+
297
+ # Prologue block
298
+ - dilation: [1]
299
+ dropout: *dropout
300
+ filters: 256
301
+ kernel: [33]
302
+ repeat: 1 # Prologue block is not repeated.
303
+ residual: false
304
+ separable: *separable
305
+ stride: [2]
306
+
307
+ # Block 1
308
+ - dilation: [1]
309
+ dropout: *dropout
310
+ filters: 256
311
+ kernel: [33]
312
+ repeat: *repeat
313
+ residual: true
314
+ separable: *separable
315
+ stride: [1]
316
+
317
+ ... # Entries for blocks 2~14
318
+
319
+ # Block 15
320
+ - dilation: [1]
321
+ dropout: *dropout
322
+ filters: 512
323
+ kernel: [75]
324
+ repeat: *repeat
325
+ residual: true
326
+ separable: *separable
327
+ stride: [1]
328
+
329
+ # Two epilogue blocks
330
+ - dilation: [2]
331
+ dropout: *dropout
332
+ filters: 512
333
+ kernel: [87]
334
+ repeat: 1 # Epilogue blocks are not repeated
335
+ residual: false
336
+ separable: *separable
337
+ stride: [1]
338
+
339
+ - dilation: [1]
340
+ dropout: *dropout
341
+ filters: &enc_filters 1024
342
+ kernel: [1]
343
+ repeat: 1 # Epilogue blocks are not repeated
344
+ residual: false
345
+ stride: [1]
346
+
347
+ Both Jasper and QuartzNet use the ``ConvASRDecoder`` as the decoder. The decoder parameters are detailed in the following table.
348
+
349
+ +-------------------------+------------------+---------------------------------------------------------------------------------------------------------------+---------------------------------+
350
+ | **Parameter** | **Datatype** | **Description** | **Supported Values** |
351
+ +=========================+==================+===============================================================================================================+=================================+
352
+ | :code:`feat_in` | int | The number of input features to the decoder. Should be equal to the number of filters in the last block of | |
353
+ | | | the encoder. | |
354
+ +-------------------------+------------------+---------------------------------------------------------------------------------------------------------------+---------------------------------+
355
+ | :code:`vocabulary` | list | A list of the valid output characters for your model. For example, for an English dataset, this could be a | |
356
+ | | | list of all lowercase letters, space, and apostrophe. | |
357
+ +-------------------------+------------------+---------------------------------------------------------------------------------------------------------------+---------------------------------+
358
+ | :code:`num_classes` | int | Number of output classes, i.e. the length of :code:`vocabulary`. | |
359
+ +-------------------------+------------------+---------------------------------------------------------------------------------------------------------------+---------------------------------+
360
+
361
+ For example, a decoder config corresponding to the encoder above should look similar to the following:
362
+
363
+ .. code-block:: yaml
364
+
365
+ model:
366
+ ...
367
+ decoder:
368
+ _target_: nemo.collections.asr.modules.ConvASRDecoder
369
+ feat_in: *enc_filters
370
+ vocabulary: *labels
371
+ num_classes: 28 # Length of the vocabulary list
372
+
373
+ Citrinet
374
+ ~~~~~~~~
375
+
376
+ The `Citrinet <./models.html#Citrinet>`__ and `QuartzNet <./models.html#QuartzNet>`__ models are very similar, and as such the
377
+ components in their configs are very similar as well. Citrinet utilizes Squeeze and Excitation, as well as sub-word tokenization, in
378
+ contrast to QuartzNet. Depending on the dataset, we utilize different tokenizers. For Librispeech, we utilize the HuggingFace WordPiece
379
+ tokenizer, and for all other datasets we utilize the Google Sentencepiece tokenizer - usually the ``unigram`` tokenizer type.
380
+
381
+ Both architectures use the ``ConvASREncoder`` for the ``encoder``, with parameters detailed above. The encoder parameters include
382
+ details about the Citrinet-C encoder architecture, including how many filters are used per channel (``C``). The Citrinet-C
383
+ configuration is a shortform notation for Citrinet-21x5xC, such that ``B = 21`` and ``R = 5`` are the default and should generally
384
+ not be changed.
385
+
386
+ To use Citrinet instead of QuartzNet, refer to the ``citrinet_512.yaml`` configuration found inside the ``examples/asr/conf/citrinet``
387
+ directory. Citrinet is primarily comprised of the same :class:`~nemo.collections.asr.parts.submodules.jasper.JasperBlock` as ``Jasper`` or
388
+ ``QuartzNet``.
389
+
390
+ While the configs for Citrinet and QuartzNet are similar, we note the additional flags used for Citrinet below. Refer to the
391
+ ``JasperBlock`` documentation for the meaning of these arguments.
392
+
393
+ +---------------------------+------------------+-----------------------------------------------------------------------------------------------------------+-----------------------------------+
394
+ | **Parameter** | **Datatype** | **Description** | **Supported Values** |
395
+ +===========================+==================+===========================================================================================================+===================================+
396
+ | :code:`se` | bool | Whether to apply squeeze-and-excitation mechanism or not. | :code:`true` or :code:`false` |
397
+ +---------------------------+------------------+-----------------------------------------------------------------------------------------------------------+-----------------------------------+
398
+ | :code:`se_context_size` | int | SE context size. -1 means global context. | :code:`-1` or :code:`+ve int` |
399
+ +---------------------------+------------------+-----------------------------------------------------------------------------------------------------------+-----------------------------------+
400
+ | :code:`stride_last` | bool | Stride on the final repeated block or all repeated blocks. | :code:`true` or :code:`false` |
401
+ +---------------------------+------------------+-----------------------------------------------------------------------------------------------------------+-----------------------------------+
402
+ | :code:`residual_mode` | str | Type of residual branch to construct. | :code:`"add"` or |
403
+ | | | Can be pointwise residual addition or pointwise strided residual attention | :code:`"stride_add"` |
404
+ +---------------------------+------------------+-----------------------------------------------------------------------------------------------------------+-----------------------------------+
405
+
406
+ A Citrinet-512 config should look similar to the following:
407
+
408
+ .. code-block:: yaml
409
+
410
+ model:
411
+ ...
412
+ # Specify some defaults across the entire model
413
+ model_defaults:
414
+ repeat: 5
415
+ dropout: 0.1
416
+ separable: true
417
+ se: true
418
+ se_context_size: -1
419
+ ...
420
+ encoder:
421
+ _target_: nemo.collections.asr.modules.ConvASREncoder
422
+ feat_in: *n_mels # Should match "features" in the preprocessor.
423
+ activation: relu
424
+ conv_mask: true
425
+
426
+ jasper: # This field name should be "jasper" for the JasperBlock (which constructs Citrinet).
427
+
428
+ # Prologue block
429
+ - filters: 512
430
+ repeat: 1
431
+ kernel: [5]
432
+ stride: [1]
433
+ dilation: [1]
434
+ dropout: 0.0
435
+ residual: false
436
+ separable: ${model.model_defaults.separable}
437
+ se: ${model.model_defaults.se}
438
+ se_context_size: ${model.model_defaults.se_context_size}
439
+
440
+ # Block 1
441
+ - filters: 512
442
+ repeat: ${model.model_defaults.repeat}
443
+ kernel: [11]
444
+ stride: [2]
445
+ dilation: [1]
446
+ dropout: ${model.model_defaults.dropout}
447
+ residual: true
448
+ separable: ${model.model_defaults.separable}
449
+ se: ${model.model_defaults.se}
450
+ se_context_size: ${model.model_defaults.se_context_size}
451
+ stride_last: true
452
+ residual_mode: "stride_add"
453
+
454
+ ... # Entries for blocks 2~21
455
+
456
+ # Block 22
457
+ - filters: 512
458
+ repeat: ${model.model_defaults.repeat}
459
+ kernel: [39]
460
+ stride: [1]
461
+ dilation: [1]
462
+ dropout: ${model.model_defaults.dropout}
463
+ residual: true
464
+ separable: ${model.model_defaults.separable}
465
+ se: ${model.model_defaults.se}
466
+ se_context_size: ${model.model_defaults.se_context_size}
467
+
468
+ # Epilogue block
469
+
470
+ - filters: &enc_final 640
471
+ repeat: 1
472
+ kernel: [41]
473
+ stride: [1]
474
+ dilation: [1]
475
+ dropout: 0.0
476
+ residual: false
477
+ separable: ${model.model_defaults.separable}
478
+ se: ${model.model_defaults.se}
479
+ se_context_size: ${model.model_defaults.se_context_size}
480
+
481
+ As mentioned above, Citrinet uses the ``ConvASRDecoder`` as the decoder layer similar to QuartzNet. Only the configuration must be
482
+ changed slightly as Citrinet utilizes sub-word tokenization.
483
+
484
+ .. note::
485
+ The following information is relevant to any of the above models that implements its encoder as an :class:`~nemo.collections.asr.modules.conv_asr.ConvASREncoder`, and utilizes the ``SqueezeExcite`` mechanism.
486
+
487
+ The ``SqueezeExcite`` block within a :class:`~nemo.collections.asr.modules.conv_asr.ConvASREncoder` network can be modified to utilize a different context window after the model has been instantiated (even after the model has been trained) so as to evaluate the model with limited context. This can be achieved using the :meth:`~nemo.collections.asr.parts.mixins.mixins.ASRModuleMixin.change_conv_asr_se_context_window`
488
+
489
+ .. code-block:: python
490
+
491
+ # Here, model can be any model that has a `ConvASREncoder` as its encoder, and utilized `SqueezeExcite` blocks
492
+ # `context_window` : It is an integer representing the number of timeframes (each corresponding to some window stride).
493
+ # `update_config` : Bool flag which determines whether the config of the model should be updated to reflect the new context window.
494
+
495
+ # Here, we specify that 128 timeframes of 0.01s stride should be the context window
496
+ # This is equivalent to 128 * 0.01s context window for `SqueezeExcite`
497
+ model.change_conv_asr_se_context_window(context_window=128, update_config=True)
498
+
499
+ Conformer-CTC
500
+ ~~~~~~~~~~~~~
501
+
502
+ The config files for the Conformer-CTC model with character-based encoding and sub-word encoding are located at
503
+ ``<NeMo_git_root>/examples/asr/conf/conformer/conformer_ctc_char.yaml`` and ``<NeMo_git_root>/examples/asr/conf/conformer/conformer_ctc_bpe.yaml``
504
+ respectively. Some components of the configs of `Conformer-CTC <./models.html#Conformer-CTC>`__ include the following sections:
505
+
506
+ * ``train_ds``, ``validation_ds``, and ``test_ds``
507
+ * optimizer (``optim``)
508
+ * augmentation (``spec_augment``)
509
+ * ``decoder``
510
+ * ``trainer``
511
+ * ``exp_manager``
512
+
513
+ These sections are similar to those of other ASR models like `QuartzNet <./models.html#QuartzNet>`__. There should be a tokenizer section where you can
514
+ specify the tokenizer if you want to use sub-word encoding instead of character-based encoding.
515
+
516
+
517
+ The encoder section includes the details about the Conformer-CTC encoder architecture. You may find more information in the
518
+ config files and also :ref:`nemo.collections.asr.modules.ConformerEncoder <conformer-encoder-api>`.
519
+
520
+ Squeezeformer-CTC
521
+ ~~~~~~~~~~~~~~~~~
522
+
523
+ The config files for the Squeezeformer-CTC model with character-based encoding and sub-word encoding are located at
524
+ ``<NeMo_git_root>/examples/asr/conf/squeezeformer/squeezeformer_ctc_char.yaml`` and ``<NeMo_git_root>/examples/asr/conf/squeezeformer/squeezeformer_ctc_bpe.yaml``
525
+ respectively. Components of the configs of `Squeezeformer-CTC <./models.html#Squeezeformer-CTC>`__ are similar to those of the `Conformer-CTC <./configs.html#Conformer-CTC>`__ config.
526
+
527
+ The encoder section includes the details about the Squeezeformer-CTC encoder architecture. You may find more information in the
528
+ config files and also :ref:`nemo.collections.asr.modules.SqueezeformerEncoder <squeezeformer-encoder-api>`.
529
+
530
+
531
+ ContextNet
532
+ ~~~~~~~~~~
533
+
534
+ Please refer to the model page of `ContextNet <./models.html#ContextNet>`__ for more information on this model.
535
+
536
+ Conformer-Transducer
537
+ ~~~~~~~~~~~~~~~~~~~~
538
+
539
+ Please refer to the model page of `Conformer-Transducer <./models.html#Conformer-Transducer>`__ for more information on this model.
540
+
541
+ LSTM-Transducer and LSTM-CTC
542
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
543
+
544
+ The config files for LSTM-Transducer and LSTM-CTC models can be found at ``<NeMo_git_root>/examples/asr/conf/lstm/lstm_transducer_bpe.yaml`` and ``<NeMo_git_root>/examples/asr/conf/lstm/lstm_ctc_bpe.yaml`` respectively.
545
+ Most of the configs are similar to those of other CTC or Transducer models. The main difference is the encoder part.
546
+ The encoder section includes the details about the RNN-based encoder architecture. You may find more information in the
547
+ config files and also :ref:`nemo.collections.asr.modules.RNNEncoder <rnn-encoder-api>`.
548
+
549
+
550
+ InterCTC Config
551
+ ---------------
552
+
553
+ All CTC-based models also support `InterCTC loss <https://arxiv.org/abs/2102.03216>`_. To use it, you need to specify
554
+ 2 parameters as in the example below:
555
+
556
+ .. code-block:: yaml
557
+
558
+ model:
559
+ # ...
560
+ interctc:
561
+ loss_weights: [0.3]
562
+ apply_at_layers: [8]
563
+
564
+ which can be used to reproduce the default setup from the paper (assuming the total number of layers is 18).
565
+ You can also specify multiple CTC losses from different layers, e.g., to get 2 losses from layers 3 and 8 with
566
+ weights 0.1 and 0.3, specify:
567
+
568
+ .. code-block:: yaml
569
+
570
+ model:
571
+ # ...
572
+ interctc:
573
+ loss_weights: [0.1, 0.3]
574
+ apply_at_layers: [3, 8]
575
+
576
+ Note that the final-layer CTC loss weight is automatically computed to normalize
577
+ all weights to 1 (0.6 in the example above).
578
+
579
+
580
+ Stochastic Depth Config
581
+ -----------------------
582
+
583
+ `Stochastic Depth <https://arxiv.org/abs/2102.03216>`_ is a useful technique for regularizing ASR model training.
584
+ Currently it's only supported for :ref:`nemo.collections.asr.modules.ConformerEncoder <conformer-encoder-api>`. To
585
+ use it, specify the following parameters in the encoder config file to reproduce the default setup from the paper:
586
+
587
+ .. code-block:: yaml
588
+
589
+ model:
590
+ # ...
591
+ encoder:
592
+ # ...
593
+ stochastic_depth_drop_prob: 0.3
594
+ stochastic_depth_mode: linear # linear or uniform
595
+ stochastic_depth_start_layer: 1
596
+
597
+ See :ref:`documentation of ConformerEncoder <conformer-encoder-api>` for more details. Note that stochastic depth
598
+ is supported for both CTC and Transducer model variations (or any other kind of model/loss that's using
599
+ conformer as encoder).
600
+
601
+
602
+ Transducer Configurations
603
+ -------------------------
604
+
605
+ All CTC-based ASR model configs can be modified to support Transducer loss training. Below, we discuss the modifications required in the config to enable Transducer training. All modifications are made to the ``model`` config.
606
+
607
+ Model Defaults
608
+ ~~~~~~~~~~~~~~
609
+
610
+ It is a subsection to the model config representing the default values shared across the entire model represented as ``model.model_defaults``.
611
+
612
+ There are three values that are primary components of a transducer model. They are :
613
+
614
+ * ``enc_hidden``: The hidden dimension of the final layer of the Encoder network.
615
+ * ``pred_hidden``: The hidden dimension of the final layer of the Prediction network.
616
+ * ``joint_hidden``: The hidden dimension of the intermediate layer of the Joint network.
617
+
618
+ One can access these values inside the config by using OmegaConf interpolation as follows :
619
+
620
+ .. code-block:: yaml
621
+
622
+ model:
623
+ ...
624
+ model_defaults:
625
+ enc_hidden: 256
626
+ pred_hidden: 256
627
+ joint_hidden: 256
628
+ ...
629
+ decoder:
630
+ ...
631
+ prednet:
632
+ pred_hidden: ${model.model_defaults.pred_hidden}
633
+
634
+ Acoustic Encoder Model
635
+ ~~~~~~~~~~~~~~~~~~~~~~
636
+
637
+ The transducer model is comprised of three models combined. One of these models is the Acoustic (encoder) model. We should be able to drop in any CTC Acoustic model config into this section of the transducer config.
638
+
639
+ The only condition that needs to be met is that **the final layer of the acoustic model must have the hidden dimension defined in ``model_defaults.enc_hidden``**.
640
+
641
+ Decoder / Prediction Model
642
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~
643
+
644
+ The Prediction model is generally an autoregressive, causal model that consumes text tokens and returns embeddings that will be used by the Joint model. The base config for an LSTM based Prediction network can be found in the ``decoder`` section of `ContextNet <./models.html#ContextNet>`__ or other Transducer architectures. For further information refer to the ``Intro to Transducers`` tutorial in the ASR tutorial section.
645
+
646
+ **This config can be copy-pasted into any custom transducer model with no modification.**
647
+
648
+ Let us discuss some of the important arguments:
649
+
650
+ * ``blank_as_pad``: In ordinary transducer models, the embedding matrix does not acknowledge the ``Transducer Blank`` token (similar to CTC Blank). However, this causes the autoregressive loop to be more complicated and less efficient. Instead, this flag which is set by default, will add the ``Transducer Blank`` token to the embedding matrix - and use it as a pad value (zeros tensor). This enables more efficient inference without harming training. For further information refer to the ``Intro to Transducers`` tutorial in the ASR tutorial section.
651
+
652
+ * ``prednet.pred_hidden``: The hidden dimension of the LSTM and the output dimension of the Prediction network.
653
+
654
+ .. code-block:: yaml
655
+
656
+ decoder:
657
+ _target_: nemo.collections.asr.modules.RNNTDecoder
658
+ normalization_mode: null
659
+ random_state_sampling: false
660
+ blank_as_pad: true
661
+
662
+ prednet:
663
+ pred_hidden: ${model.model_defaults.pred_hidden}
664
+ pred_rnn_layers: 1
665
+ t_max: null
666
+ dropout: 0.0
667
+
668
+ Joint Model
669
+ ~~~~~~~~~~~
670
+
671
+ The Joint model is a simple feed-forward Multi-Layer Perceptron network. This MLP accepts the output of the Acoustic and Prediction models and computes a joint probability distribution over the entire vocabulary space. The base config for the Joint network can be found in the ``joint`` section of `ContextNet <./models.html#ContextNet>`__ or other Transducer architectures. For further information refer to the ``Intro to Transducers`` tutorial in the ASR tutorial section.
672
+
673
+ **This config can be copy-pasted into any custom transducer model with no modification.**
674
+
675
+ The Joint model config has several essential components which we discuss below :
676
+
677
+ * ``log_softmax``: Due to the cost of computing softmax on such large tensors, the Numba CUDA implementation of RNNT loss will implicitly compute the log softmax when called (so its inputs should be logits). The CPU version of the loss doesn't face such memory issues so it requires log-probabilities instead. Since the behaviour is different for CPU-GPU, the ``None`` value will automatically switch behaviour dependent on whether the input tensor is on a CPU or GPU device.
678
+
679
+ * ``preserve_memory``: This flag will call ``torch.cuda.empty_cache()`` at certain critical sections when computing the Joint tensor. While this operation might allow us to preserve some memory, the empty_cache() operation is tremendously slow and will slow down training by an order of magnitude or more. It is available to use but not recommended.
680
+
681
+ * ``fuse_loss_wer``: This flag performs "batch splitting" and then "fused loss + metric" calculation. It will be discussed in detail in the next tutorial that will train a Transducer model.
682
+
683
+ * ``fused_batch_size``: When the above flag is set to True, the model will have two distinct "batch sizes". The batch size provided in the three data loader configs (``model.*_ds.batch_size``) will now be the ``Acoustic model`` batch size, whereas the ``fused_batch_size`` will be the batch size of the ``Prediction model``, the ``Joint model``, the ``transducer loss`` module and the ``decoding`` module.
684
+
685
+ * ``jointnet.joint_hidden``: The hidden intermediate dimension of the joint network.
686
+
687
+ .. code-block:: yaml
688
+
689
+ joint:
690
+ _target_: nemo.collections.asr.modules.RNNTJoint
691
+ log_softmax: null # sets it according to cpu/gpu device
692
+
693
+ # fused mode
694
+ fuse_loss_wer: false
695
+ fused_batch_size: 16
696
+
697
+ jointnet:
698
+ joint_hidden: ${model.model_defaults.joint_hidden}
699
+ activation: "relu"
700
+ dropout: 0.0
701
+
702
+ Sampled Softmax Joint Model
703
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^
704
+
705
+ There are some situations where a large vocabulary is needed with a Transducer model - such as for multilingual models with a large
706
+ number of languages. In this setting, we need to consider the cost of memory of training Transducer networks which does
707
+ not allow large vocabulary.
708
+
709
+ For such cases, one can utilize the ``SampledRNNTJoint`` module instead of the usual ``RNNTJoint`` module, in order
710
+ to compute the loss using a sampled subset of the vocabulary rather than the full vocabulary file.
711
+
712
+ It adds only one additional parameter :
713
+
714
+ * ``n_samples``: Specifies the minimum number of tokens to sample from the vocabulary space,
715
+ excluding the RNNT blank token. If a given value is larger than the entire vocabulary size,
716
+ then the full vocabulary will be used.
717
+
718
+ The only difference in config required is to replace ``nemo.collections.asr.modules.RNNTJoint`` with ``nemo.collections.asr.modules.SampledRNNTJoint``
719
+
720
+ .. code-block:: yaml
721
+
722
+ joint:
723
+ _target_: nemo.collections.asr.modules.SampledRNNTJoint
724
+ n_samples: 500
725
+ ... # All other arguments from RNNTJoint can be used after this.
726
+
727
+
728
+ Effect of Batch Splitting / Fused Batch step
729
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
730
+
731
+ The information below explains why memory is an issue when training Transducer models and how NeMo tackles the issue with its Fused Batch step. The material can be read for a thorough understanding; otherwise, it can be skipped. You can also follow these steps in the "ASR_with_Transducers" tutorial.
732
+
733
+ **Diving deeper into the memory costs of Transducer Joint**
734
+
735
+ One of the significant limitations of Transducers is the exorbitant memory cost of computing the Joint module. The Joint module is comprised of two steps.
736
+
737
+ 1) Projecting the Acoustic and Transcription feature dimensions to some standard hidden dimension (specified by model.model_defaults.joint_hidden)
738
+
739
+ 2) Projecting this intermediate hidden dimension to the final vocabulary space to obtain the transcription.
740
+
741
+ Take the following example.
742
+
743
+ BS=32 ; T (after 2x stride) = 800, U (with character encoding) = 400-450 tokens, Vocabulary size V = 28 (26 alphabet chars, space and apostrophe). Let the hidden dimension of the Joint model be 640 (Most Google Transducer papers use hidden dimension of 640).
744
+
745
+ * :math:`Memory \, (Hidden, \, gb) = 32 \times 800 \times 450 \times 640 \times 4 = 29.49` gigabytes (4 bytes per float).
746
+
747
+ * :math:`Memory \, (Joint, \, gb) = 32 \times 800 \times 450 \times 28 \times 4 = 1.290` gigabytes (4 bytes per float)
748
+
749
+ **NOTE**: This is just for the forward pass! We need to double this memory to store gradients! This much memory is also just for the Joint model **alone**. Far more memory is required for the Prediction model as well as the large Acoustic model itself and its gradients!
750
+
751
+ Even with mixed precision, that's :math:`\sim 30` GB of GPU RAM for just 1 part of the network + its gradients.
752
+
753
+ Effect of Fused Batch Step
754
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^
755
+
756
+ The fundamental problem is that the joint tensor grows in size when ``[T x U]`` grows in size. This growth in memory cost is due to many reasons - either by model construction (downsampling) or the choice of dataset preprocessing (character tokenization vs. sub-word tokenization).
757
+
758
+ Another dimension that NeMo can control is **batch**. Due to how we batch our samples, small and large samples all get clumped together into a single batch. So even though the individual samples are not all as long as the maximum length of T and U in that batch, when a batch of such samples is constructed, it will consume a significant amount of memory for the sake of compute efficiency.
759
+
760
+ So as is always the case - **trade-off compute speed for memory savings**.
761
+
762
+ The fused operation goes as follows :
763
+
764
+ 1) Forward the entire acoustic model in a single pass. (Use global batch size here for acoustic model - found in ``model.*_ds.batch_size``)
765
+
766
+ 2) Split the Acoustic Model's logits by ``fused_batch_size`` and loop over these sub-batches.
767
+
768
+ 3) Construct a sub-batch of same ``fused_batch_size`` for the Prediction model. Now the target sequence length is :math:`U_{sub-batch} < U`.
769
+
770
+ 4) Feed this :math:`U_{sub-batch}` into the Joint model, along with a sub-batch from the Acoustic model (with :math:`T_{sub-batch} < T`). Remember, we only have to slice off a part of the acoustic model here since we have the full batch of samples :math:`(B, T, D)` from the acoustic model.
771
+
772
+ 5) Performing steps (3) and (4) yields :math:`T_{sub-batch}` and :math:`U_{sub-batch}`. Perform sub-batch joint step - costing an intermediate :math:`(B, T_{sub-batch}, U_{sub-batch}, V)` in memory.
773
+
774
+ 6) Compute loss on sub-batch and preserve in a list to be later concatenated.
775
+
776
+ 7) Compute sub-batch metrics (such as Character / Word Error Rate) using the above Joint tensor and sub-batch of ground truth labels. Preserve the scores to be averaged across the entire batch later.
777
+
778
+ 8) Delete the sub-batch joint matrix :math:`(B, T_{sub-batch}, U_{sub-batch}, V)`. Only gradients from .backward() are preserved now in the computation graph.
779
+
780
+ 9) Repeat steps (3) - (8) until all sub-batches are consumed.
781
+
782
+ 10) Cleanup step. Compute full batch WER and log. Concatenate loss list and pass to PTL to compute the equivalent of the original (full batch) Joint step. Delete ancillary objects necessary for sub-batching.
783
+
784
+ Transducer Decoding
785
+ ~~~~~~~~~~~~~~~~~~~
786
+
787
+ Models which have been trained with CTC can transcribe text simply by performing a regular argmax over the output of their decoder. For transducer-based models, the three networks must operate in a synchronized manner in order to transcribe the acoustic features. The base config for the Transducer decoding step can be found in the ``decoding`` section of `ContextNet <./models.html#ContextNet>`__ or other Transducer architectures. For further information refer to the ``Intro to Transducers`` tutorial in the ASR tutorial section.
788
+
789
+ **This config can be copy-pasted into any custom transducer model with no modification.**
790
+
791
+ The most important component at the top level is the ``strategy``. It can take one of many values:
792
+
793
+ * ``greedy``: This is sample-level greedy decoding. It is generally exceptionally slow as each sample in the batch will be decoded independently. For publications, this should be used alongside batch size of 1 for exact results.
794
+
795
+ * ``greedy_batch``: This is the general default and should nearly match the ``greedy`` decoding scores (if the acoustic features are not affected by feature mixing in batch mode). Even for small batch sizes, this strategy is significantly faster than ``greedy``.
796
+
797
+ * ``beam``: Runs beam search with the implicit language model of the Prediction model. It will generally be quite slow, and might need some tuning of the beam size to get better transcriptions.
798
+
799
+ * ``tsd``: Time synchronous decoding. Please refer to the paper: `Alignment-Length Synchronous Decoding for RNN Transducer <https://ieeexplore.ieee.org/document/9053040>`_ for details on the algorithm implemented. Time synchronous decoding (TSD) execution time grows by the factor T * max_symmetric_expansions. For longer sequences, T is greater and can therefore take a long time for beams to obtain good results. TSD also requires more memory to execute.
800
+
801
+ * ``alsd``: Alignment-length synchronous decoding. Please refer to the paper: `Alignment-Length Synchronous Decoding for RNN Transducer <https://ieeexplore.ieee.org/document/9053040>`_ for details on the algorithm implemented. Alignment-length synchronous decoding (ALSD) execution time is faster than TSD, with a growth factor of T + U_max, where U_max is the maximum target length expected during execution. Generally, T + U_max < T * max_symmetric_expansions. However, ALSD beams are non-unique. Therefore it is required to use larger beam sizes to achieve the same (or close to the same) decoding accuracy as TSD. For a given decoding accuracy, it is possible to attain faster decoding via ALSD than TSD.
802
+
803
+ * ``maes``: Modified Adaptive Expansion Search Decoding. Please refer to the paper `Accelerating RNN Transducer Inference via Adaptive Expansion Search <https://ieeexplore.ieee.org/document/9250505>`_. Modified Adaptive Expansion Search (mAES) execution time is adaptive w.r.t the number of expansions (for tokens) required per timestep. The number of expansions can usually be constrained to 1 or 2, and in most cases 2 is sufficient. This beam search technique can possibly obtain superior WER while sacrificing some evaluation time.
804
+
805
+ .. code-block:: yaml
806
+
807
+ decoding:
808
+ strategy: "greedy_batch"
809
+
810
+ # preserve decoding alignments
811
+ preserve_alignments: false
812
+
813
+ # Overrides the fused batch size after training.
814
+ # Setting it to -1 will process whole batch at once when combined with `greedy_batch` decoding strategy
815
+ fused_batch_size: -1
816
+
817
+ # greedy strategy config
818
+ greedy:
819
+ max_symbols: 10
820
+
821
+ # beam strategy config
822
+ beam:
823
+ beam_size: 2
824
+ score_norm: true
825
+ softmax_temperature: 1.0 # scale the logits by some temperature prior to softmax
826
+ tsd_max_sym_exp: 10 # for Time Synchronous Decoding, int > 0
827
+ alsd_max_target_len: 5.0 # for Alignment-Length Synchronous Decoding, float > 1.0
828
+ maes_num_steps: 2 # for modified Adaptive Expansion Search, int > 0
829
+ maes_prefix_alpha: 1 # for modified Adaptive Expansion Search, int > 0
830
+ maes_expansion_beta: 2 # for modified Adaptive Expansion Search, int >= 0
831
+ maes_expansion_gamma: 2.3 # for modified Adaptive Expansion Search, float >= 0
832
+
833
+ Transducer Loss
834
+ ~~~~~~~~~~~~~~~
835
+
836
+ This section configures the type of Transducer loss itself, along with possible sub-sections. By default, an optimized implementation of Transducer loss will be used which depends on Numba for CUDA acceleration. The base config for the Transducer loss section can be found in the the ``loss`` section of `ContextNet <./models.html#ContextNet>`__ or other Transducer architectures. For further information refer to the ``Intro to Transducers`` tutorial in the ASR tutorial section.
837
+
838
+ **This config can be copy-pasted into any custom transducer model with no modification.**
839
+
840
+ The loss config is based on a resolver pattern and can be used as follows:
841
+
842
+ 1) ``loss_name``: ``default`` is generally a good option. Will select one of the available resolved losses and match the kwargs from a sub-config passed via explicit ``{loss_name}_kwargs`` sub-config.
843
+
844
+ 2) ``{loss_name}_kwargs``: This sub-config is passed to the resolved loss above and can be used to configure the resolved loss.
845
+
846
+
847
+ .. code-block:: yaml
848
+
849
+ loss:
850
+ loss_name: "default"
851
+ warprnnt_numba_kwargs:
852
+ fastemit_lambda: 0.0
853
+
854
+ FastEmit Regularization
855
+ ^^^^^^^^^^^^^^^^^^^^^^^
856
+
857
+ FastEmit Regularization is supported for the default Numba based WarpRNNT loss. The recently proposed regularization approach - `FastEmit: Low-latency Streaming ASR with Sequence-level Emission Regularization <https://arxiv.org/abs/2010.11148>`_ - allows us near-direct control over the latency of transducer models.
858
+
859
+ Refer to the above paper for results and recommendations of ``fastemit_lambda``.
860
+
861
+
862
+ Fine-tuning Configurations
863
+ --------------------------
864
+
865
+ All ASR scripts support easy fine-tuning by partially/fully loading the pretrained weights from a checkpoint into the **currently instantiated model**. Note that the currently instantiated model should have parameters that match the pre-trained checkpoint (such that weights may load properly). In order to directly fine-tune a pre-existing checkpoint, please follow the tutorial `ASR Language Fine-tuning. <https://colab.research.google.com/github/NVIDIA/NeMo/blob/stable/tutorials/asr/ASR_CTC_Language_Finetuning.ipynb>`_
866
+
867
+ Pre-trained weights can be provided in multiple ways -
868
+
869
+ 1) Providing a path to a NeMo model (via ``init_from_nemo_model``)
870
+ 2) Providing a name of a pretrained NeMo model (which will be downloaded via the cloud) (via ``init_from_pretrained_model``)
871
+ 3) Providing a path to a Pytorch Lightning checkpoint file (via ``init_from_ptl_ckpt``)
872
+
873
+ There are multiple ASR subtasks inside the ``examples/asr/`` directory, you can substitute the ``<subtask>`` tag below.
874
+
875
+ Fine-tuning via a NeMo model
876
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
877
+
878
+ .. code-block:: sh
879
+
880
+ python examples/asr/<subtask>/script_to_<script_name>.py \
881
+ --config-path=<path to dir of configs> \
882
+ --config-name=<name of config without .yaml> \
883
+ model.train_ds.manifest_filepath="<path to manifest file>" \
884
+ model.validation_ds.manifest_filepath="<path to manifest file>" \
885
+ trainer.devices=-1 \
886
+ trainer.accelerator='gpu' \
887
+ trainer.max_epochs=50 \
888
+ +init_from_nemo_model="<path to .nemo model file>"
889
+
890
+
891
+ Fine-tuning via a NeMo pretrained model name
892
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
893
+
894
+ .. code-block:: sh
895
+
896
+ python examples/asr/<subtask>/script_to_<script_name>.py \
897
+ --config-path=<path to dir of configs> \
898
+ --config-name=<name of config without .yaml> \
899
+ model.train_ds.manifest_filepath="<path to manifest file>" \
900
+ model.validation_ds.manifest_filepath="<path to manifest file>" \
901
+ trainer.devices=-1 \
902
+ trainer.accelerator='gpu' \
903
+ trainer.max_epochs=50 \
904
+ +init_from_pretrained_model="<name of pretrained checkpoint>"
905
+
906
+ Fine-tuning via a Pytorch Lightning checkpoint
907
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
908
+
909
+ .. code-block:: sh
910
+
911
+ python examples/asr/<subtask>/script_to_<script_name>.py \
912
+ --config-path=<path to dir of configs> \
913
+ --config-name=<name of config without .yaml> \
914
+ model.train_ds.manifest_filepath="<path to manifest file>" \
915
+ model.validation_ds.manifest_filepath="<path to manifest file>" \
916
+ trainer.devices=-1 \
917
+ trainer.accelerator='gpu' \
918
+ trainer.max_epochs=50 \
919
+ +init_from_ptl_ckpt="<name of pytorch lightning checkpoint>"
920
+
921
+ Fine-tuning Execution Flow Diagram
922
+ ----------------------------------
923
+
924
+ When preparing your own training or fine-tuning scripts, please follow the execution flow diagram order for correct inference.
925
+
926
+ Depending on the type of model, there may be extra steps that must be performed -
927
+
928
+ * CTC Models - `Examples directory for CTC Models <https://github.com/NVIDIA/NeMo/blob/stable/examples/asr/asr_ctc/README.md>`_
929
+ * RNN Transducer Models - `Examples directory for Transducer Models <https://github.com/NVIDIA/NeMo/blob/stable/examples/asr/asr_transducer/README.md>`_
docs/source/asr/data/asrlm_results.csv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ Model Name,Model Base Class,Model Card
2
+ asrlm_en_transformer_large_ls,TransformerLMModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:asrlm_en_transformer_large_ls"
docs/source/asr/data/benchmark_ca.csv ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ Model,Model Base Class,Model Card
2
+ stt_ca_quartznet15x5,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_ca_quartznet15x5"
3
+ stt_ca_conformer_ctc_large,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_ca_conformer_ctc_large"
4
+ stt_ca_conformer_transducer_large,EncDecRNNTBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_ca_conformer_transducer_large"
docs/source/asr/data/benchmark_de.csv ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ Model,Model Base Class,Model Card
2
+ stt_de_quartznet15x5,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_de_quartznet15x5"
3
+ stt_de_citrinet_1024,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_de_citrinet_1024"
4
+ stt_de_contextnet_1024,EncDecRNNTBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_de_contextnet_1024"
5
+ stt_de_conformer_ctc_large,EncDecCTCModelBPE,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_de_conformer_ctc_large"
6
+ stt_de_conformer_transducer_large,EncDecRNNTBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_de_conformer_transducer_large"
docs/source/asr/data/benchmark_en.csv ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Model Name,Model Base Class,Model Card
2
+ QuartzNet15x5Base-En,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemospeechmodels"
3
+ stt_en_jasper10x5dr,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_jasper10x5dr"
4
+ stt_en_citrinet_256,EncDecCTCModelBPE,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_citrinet_256"
5
+ stt_en_citrinet_512,EncDecCTCModelBPE,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_citrinet_512"
6
+ stt_en_citrinet_1024,EncDecCTCModelBPE,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_citrinet_1024"
7
+ stt_en_citrinet_256_gamma_0_25,EncDecCTCModelBPE,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_citrinet_256_gamma_0_25"
8
+ stt_en_citrinet_512_gamma_0_25,EncDecCTCModelBPE,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_citrinet_512_gamma_0_25"
9
+ stt_en_citrinet_1024_gamma_0_25,EncDecCTCModelBPE,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_citrinet_1024_gamma_0_25"
10
+ stt_en_contextnet_256_mls,EncDecRNNTBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_contextnet_256_mls"
11
+ stt_en_contextnet_512_mls,EncDecRNNTBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_contextnet_512_mls"
12
+ stt_en_contextnet_1024_mls,EncDecRNNTBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_contextnet_1024_mls"
13
+ stt_en_contextnet_256,EncDecRNNTBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_contextnet_256"
14
+ stt_en_contextnet_512,EncDecRNNTBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_contextnet_512"
15
+ stt_en_contextnet_1024,EncDecRNNTBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_contextnet_1024"
16
+ stt_en_conformer_ctc_small,EncDecCTCModelBPE,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_ctc_small"
17
+ stt_en_conformer_ctc_medium,EncDecCTCModelBPE,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_ctc_medium"
18
+ stt_en_conformer_ctc_large,EncDecCTCModelBPE,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_ctc_large"
19
+ stt_en_conformer_ctc_xlarge,EncDecCTCModelBPE,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_ctc_xlarge"
20
+ stt_en_conformer_ctc_small_ls,EncDecCTCModelBPE,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_ctc_small_ls"
21
+ stt_en_conformer_ctc_medium_ls,EncDecCTCModelBPE,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_ctc_medium_ls"
22
+ stt_en_conformer_ctc_large_ls,EncDecCTCModelBPE,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_ctc_large_ls"
23
+ stt_en_conformer_transducer_large_ls,EncDecRNNTBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_transducer_large_ls"
24
+ stt_en_conformer_transducer_small,EncDecRNNTBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_transducer_small"
25
+ stt_en_conformer_transducer_medium,EncDecRNNTBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_transducer_medium"
26
+ stt_en_conformer_transducer_large,EncDecRNNTBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_transducer_large"
27
+ stt_en_conformer_transducer_xlarge,EncDecRNNTBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_transducer_xlarge"
28
+ stt_en_conformer_transducer_xxlarge,EncDecRNNTBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_transducer_xxlarge"
docs/source/asr/data/benchmark_es.csv ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ Model,Model Base Class,Model Card
2
+ stt_es_quartznet15x5,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_es_quartznet15x5"
3
+ stt_es_citrinet_512,EncDecCTCModelBPE,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_es_citrinet_512"
4
+ stt_es_citrinet_1024_gamma_0_25,EncDecCTCModelBPE,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_es_citrinet_1024_gamma_0_25"
5
+ stt_es_conformer_ctc_large,EncDecCTCModelBPE,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_es_conformer_ctc_large"
6
+ stt_es_conformer_transducer_large,EncDecRNNTBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_es_conformer_transducer_large"
7
+ stt_es_contextnet_1024,EncDecRNNTBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_es_contextnet_1024"
docs/source/asr/data/benchmark_fr.csv ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ Model,Model Base Class,Model Card
2
+ stt_fr_quartznet15x5,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_fr_quartznet15x5"
3
+ stt_fr_citrinet_1024_gamma_0_25,EncDecCTCModelBPE,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_fr_citrinet_1024_gamma_0_25"
4
+ stt_fr_no_hyphen_citrinet_1024_gamma_0_25,EncDecCTCModelBPE,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_fr_citrinet_1024_gamma_0_25"
5
+ stt_fr_contextnet_1024,EncDecRNNTBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_fr_contextnet_1024"
6
+ stt_fr_conformer_ctc_large,EncDecCTCModelBPE,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_fr_conformer_ctc_large"
7
+ stt_fr_no_hyphen_conformer_ctc_large,EncDecCTCModelBPE,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_fr_conformer_ctc_large"
8
+ stt_fr_conformer_transducer_large,EncDecRNNTBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_fr_conformer_transducer_large"
docs/source/asr/data/benchmark_hi.csv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ Model Name,Model Base Class,Model Card
2
+ stt_hi_conformer_ctc_medium,EncDecCTCModelBPE,"https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_hi_conformer_ctc_medium"
docs/source/asr/data/benchmark_hr.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ Model,Model Base Class,Model Card
2
+ stt_hr_conformer_ctc_large,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_hr_conformer_ctc_large"
3
+ stt_hr_conformer_transducer_large,EncDecRNNTBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_hr_conformer_transducer_large"
docs/source/asr/data/benchmark_it.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ Model,Model Base Class,Model Card
2
+ stt_it_quartznet15x5,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_it_quartznet15x5"
3
+
docs/source/asr/data/benchmark_kab.csv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ Model,Model Base Class,Model Card
2
+ stt_kab_conformer_transducer_large,EncDecRNNTBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_kab_conformer_transducer_large"
docs/source/asr/data/benchmark_mr.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ Model Name,Model Base Class,Model Card
2
+ stt_mr_conformer_ctc_medium,EncDecCTCModelBPE,"https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_mr_conformer_ctc_medium"
3
+
docs/source/asr/data/benchmark_pl.csv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ Model,Model Base Class,Model Card
2
+ stt_pl_quartznet15x5,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_pl_quartznet15x5"
docs/source/asr/data/benchmark_ru.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ Model,Model Base Class,Model Card
2
+ stt_ru_quartznet15x5,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_ru_quartznet15x5"
3
+
docs/source/asr/data/benchmark_rw.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ Model,Model Base Class,Model Card
2
+ stt_rw_conformer_ctc_large,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_rw_conformer_ctc_large"
3
+ stt_rw_conformer_transducer_large,EncDecRNNTBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_rw_conformer_transducer_large"
docs/source/asr/data/benchmark_zh.csv ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ Model,Model Base Class,Model Card
2
+ stt_zh_citrinet_512,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_zh_citrinet_512"
3
+ stt_zh_citrinet_1024_gamma_0_25,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_zh_citrinet_1024_gamma_0_25"
4
+ stt_zh_conformer_transducer_large,EncDecRNNTModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_zh_conformer_transducer_large"