import glob |
|
import json |
|
import os |
|
import tempfile |
|
from argparse import ArgumentParser |
|
|
|
import torch |
|
from omegaconf import OmegaConf |
|
from tqdm import tqdm |
|
|
|
from nemo.collections.asr.metrics.wer import word_error_rate |
|
from nemo.collections.asr.models import ASRModel |
|
from nemo.collections.asr.parts.submodules.rnnt_greedy_decoding import TorchscriptGreedyBatchedRNNTInfer |
|
from nemo.utils import logging |
|
|
|
|
|
""" |
|
Script to compare the outputs of a NeMo PyTorch-based RNNT model and its exported Torchscript representation.
|
|
|
# Compare a NeMo and Torchscript model |
|
python infer_transducer_ts.py \ |
|
--nemo_model="<path to a .nemo file>" \ |
|
OR |
|
--pretrained_model="<name of a pretrained model>" \ |
|
--ts_encoder="<path to ts encoder file>" \ |
|
--ts_decoder="<path to ts decoder-joint file>" \ |
|
    --ts_cfg="<path to the exported ts model's config file>" \
|
--dataset_manifest="<Either pass a manifest file path here>" \ |
|
--audio_dir="<Or pass a directory containing preprocessed monochannel audio files>" \ |
|
    --max_symbols_per_step=5 \
|
--batch_size=32 \ |
|
--log |
|
|
|
# Export and compare a NeMo and Torchscript model |
|
python infer_transducer_ts.py \ |
|
--nemo_model="<path to a .nemo file>" \ |
|
OR |
|
--pretrained_model="<name of a pretrained model>" \ |
|
--export \ |
|
--dataset_manifest="<Either pass a manifest file path here>" \ |
|
--audio_dir="<Or pass a directory containing preprocessed monochannel audio files>" \ |
|
    --max_symbols_per_step=5 \
|
--batch_size=32 \ |
|
--log |
|
|
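# Save just the exported model's config yaml (a sketch, following the hint in `assert_args`)
python -c "from nemo.collections.asr.models import ASRModel; from omegaconf import OmegaConf; \
model = ASRModel.restore_from('<path to a .nemo file>'); OmegaConf.save(model.cfg, 'ts_cfg.yaml')"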
|
""" |
|
|
|
|
|
def parse_arguments(): |
|
parser = ArgumentParser() |
|
parser.add_argument( |
|
"--nemo_model", type=str, default=None, required=False, help="Path to .nemo file", |
|
) |
|
parser.add_argument( |
|
'--pretrained_model', type=str, default=None, required=False, help='Name of a pretrained NeMo file' |
|
) |
|
parser.add_argument('--ts_encoder', type=str, default=None, required=False, help="Path to ts encoder model") |
|
parser.add_argument( |
|
'--ts_decoder', type=str, default=None, required=False, help="Path to ts decoder + joint model" |
|
) |
|
parser.add_argument( |
|
'--ts_cfg', type=str, default=None, required=False, help='Path to the yaml config of the exported model' |
|
) |
|
    parser.add_argument('--threshold', type=float, default=0.01, required=False, help='Maximum character error rate allowed between the PyTorch and Torchscript transcripts')
|
|
|
parser.add_argument('--dataset_manifest', type=str, default=None, required=False, help='Path to dataset manifest') |
|
parser.add_argument('--audio_dir', type=str, default=None, required=False, help='Path to directory of audio files') |
|
parser.add_argument('--audio_type', type=str, default='wav', help='File format of audio') |
|
|
|
parser.add_argument( |
|
'--export', action='store_true', help="Whether to export the model into torchscript prior to eval" |
|
) |
|
    parser.add_argument('--max_symbols_per_step', type=int, default=5, required=False, help='Maximum number of symbols per decoding step')

    parser.add_argument('--batch_size', type=int, default=32, help='Batch size')
|
parser.add_argument('--log', action='store_true', help='Log the predictions between pytorch and torchscript') |
|
|
|
args = parser.parse_args() |
|
return args |
|
|
|
|
|
def assert_args(args): |
|
if args.nemo_model is None and args.pretrained_model is None: |
|
        raise ValueError(

            "`nemo_model` or `pretrained_model` must be passed! It is required for decoding the RNNT tokens "

            "and ensuring predictions match between Torch and Torchscript."

        )
|
|
|
if args.nemo_model is not None and args.pretrained_model is not None: |
|
        raise ValueError(

            "`nemo_model` and `pretrained_model` cannot both be passed! Only one can be passed to this script."

        )
|
|
|
if args.ts_cfg is None: |
|
raise ValueError( |
|
"Must provide the yaml config of the exported model. You can obtain it by loading the " |
|
"nemo model and then using OmegaConf.save(model.cfg, 'cfg.yaml')" |
|
) |
|
|
|
if args.export and (args.ts_encoder is not None or args.ts_decoder is not None): |
|
raise ValueError("If `export` is set, then `ts_encoder` and `ts_decoder` arguments must be None") |
|
|
|
if args.audio_dir is None and args.dataset_manifest is None: |
|
        raise ValueError("One of `dataset_manifest` or `audio_dir` must be provided!")
|
|
|
if args.audio_dir is not None and args.dataset_manifest is not None: |
|
        raise ValueError("Submit either `dataset_manifest` or `audio_dir`, but not both.")
|
|
|
    if int(args.max_symbols_per_step) < 1:

        raise ValueError("`max_symbols_per_step` must be an integer > 0")
|
|
|
|
|
def export_model_if_required(args, nemo_model): |
|
if args.export: |
|
nemo_model.export(output="temp_rnnt.ts", check_trace=True) |
|
OmegaConf.save(nemo_model.cfg, "ts_cfg.yaml") |
|
|
|
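        # The RNNT Torchscript export above writes two files next to the requested
        # output name, prefixed with `encoder-` and `decoder_joint-`; point the args at them.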
args.ts_encoder = "encoder-temp_rnnt.ts" |
|
args.ts_decoder = "decoder_joint-temp_rnnt.ts" |
|
args.ts_cfg = "ts_cfg.yaml" |
|
|
|
|
|
def resolve_audio_filepaths(args): |
|
|
|
if args.audio_dir is not None: |
|
        filepaths = list(glob.glob(os.path.join(args.audio_dir, f"*.{args.audio_type}")))
|
else: |
|
|
|
filepaths = [] |
|
with open(args.dataset_manifest, 'r', encoding='utf-8') as f: |
|
for line in f: |
|
item = json.loads(line) |
|
filepaths.append(item['audio_filepath']) |
|
|
|
logging.info(f"\nTranscribing {len(filepaths)} files...\n") |
|
|
|
return filepaths |
|
|
|
|
|
def main(): |
|
args = parse_arguments() |
|
|
|
device = 'cuda' if torch.cuda.is_available() else 'cpu' |
|
|
|
|
|
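    # Instantiate the reference PyTorch model, either from a local .nemo checkpoint
    # or by name from the pretrained model registry, and freeze it for evaluation.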
if args.nemo_model is not None: |
|
nemo_model = args.nemo_model |
|
nemo_model = ASRModel.restore_from(nemo_model, map_location=device) |
|
nemo_model.freeze() |
|
elif args.pretrained_model is not None: |
|
nemo_model = args.pretrained_model |
|
nemo_model = ASRModel.from_pretrained(nemo_model, map_location=device) |
|
nemo_model.freeze() |
|
else: |
|
        raise ValueError("Please pass either `nemo_model` or `pretrained_model`!")
|
|
|
if torch.cuda.is_available(): |
|
nemo_model = nemo_model.to('cuda') |
|
|
|
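    # Optionally export the model to Torchscript and point the ts_* arguments at the artifacts.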
export_model_if_required(args, nemo_model) |
|
|
|
|
|
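    # Instantiate the greedy batched RNNT decoding loop over the exported Torchscript
    # encoder and decoder + joint modules.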
encoder_model = args.ts_encoder |
|
decoder_model = args.ts_decoder |
|
ts_cfg = OmegaConf.load(args.ts_cfg) |
|
    max_symbols_per_step = args.max_symbols_per_step
|
decoding = TorchscriptGreedyBatchedRNNTInfer(encoder_model, decoder_model, ts_cfg, device, max_symbols_per_step) |
|
|
|
audio_filepath = resolve_audio_filepaths(args) |
|
|
|
|
|
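    # Reference transcripts from the PyTorch model; index [0] selects the best hypotheses.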
actual_transcripts = nemo_model.transcribe(audio_filepath, batch_size=args.batch_size)[0] |
|
|
|
|
|
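    # Evaluate the Torchscript model: write a temporary manifest (dummy duration/text)
    # so the model's transcribe dataloader can be reused for batching.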
with tempfile.TemporaryDirectory() as tmpdir: |
|
with open(os.path.join(tmpdir, 'manifest.json'), 'w', encoding='utf-8') as fp: |
|
for audio_file in audio_filepath: |
|
entry = {'audio_filepath': audio_file, 'duration': 100000, 'text': 'nothing'} |
|
fp.write(json.dumps(entry) + '\n') |
|
|
|
config = {'paths2audio_files': audio_filepath, 'batch_size': args.batch_size, 'temp_dir': tmpdir} |
|
|
|
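        # Disable dither and padding so both models consume identical acoustic features.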
nemo_model.preprocessor.featurizer.dither = 0.0 |
|
nemo_model.preprocessor.featurizer.pad_to = 0 |
|
|
|
temporary_datalayer = nemo_model._setup_transcribe_dataloader(config) |
|
|
|
all_hypothesis = [] |
|
for test_batch in tqdm(temporary_datalayer, desc="Torchscript Transcribing"): |
|
input_signal, input_signal_length = test_batch[0], test_batch[1] |
|
input_signal = input_signal.to(device) |
|
input_signal_length = input_signal_length.to(device) |
|
|
|
|
|
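            # Compute acoustic features with the NeMo preprocessor, then run the
            # Torchscript greedy RNNT decoding loop on them.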
processed_audio, processed_audio_len = nemo_model.preprocessor( |
|
input_signal=input_signal, length=input_signal_length |
|
) |
|
|
|
hypotheses = decoding(audio_signal=processed_audio, length=processed_audio_len) |
|
|
|
|
|
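            # Map the decoded token ids back to text with the model's decoding module.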
hypotheses = nemo_model.decoding.decode_hypothesis(hypotheses) |
|
|
|
|
|
texts = [h.text for h in hypotheses] |
|
|
|
all_hypothesis += texts |
|
del processed_audio, processed_audio_len |
|
del test_batch |
|
|
|
if args.log: |
|
for pt_transcript, ts_transcript in zip(actual_transcripts, all_hypothesis): |
|
print(f"Pytorch Transcripts : {pt_transcript}") |
|
print(f"Torchscript Transcripts : {ts_transcript}") |
|
print() |
|
|
|
|
|
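    # Measure the character error rate between the PyTorch and Torchscript transcripts
    # and fail if it exceeds the allowed threshold.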
    pt_ts_cer = word_error_rate(all_hypothesis, actual_transcripts, use_cer=True)

    print("Character error rate between Pytorch and Torchscript :", pt_ts_cer)

    assert pt_ts_cer < args.threshold, f"Threshold violation! CER {pt_ts_cer} >= threshold {args.threshold}"
|
|
|
|
|
if __name__ == '__main__': |
|
main() |
|
|