{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# HuggingFace challenge - Debugger notebook\n", "Run this notebook to verify your libraries versions, check GPU config and run a quick training" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "id": "T2utsYSKszvv" }, "outputs": [], "source": [ "import platform\n", "import multiprocessing\n", "\n", "import torch\n", "import transformers\n", "import datasets\n", "\n", "import soundfile" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Print main infos" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "5P6I-W9ts-kR", "outputId": "939bd550-1486-46a6-8371-e82ada0f448c" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Platform: Linux-5.11.0-37-generic-x86_64-with-glibc2.10\n", "CPU cores: 60\n", "Python version: 3.8.8\n", "PyTorch version: 1.10.1+cu102\n", "GPU is visible: True\n", "Transformers version: 4.16.0.dev0\n", "Datasets version: 1.17.1.dev0\n", "soundfile version: 0.10.3\n" ] } ], "source": [ "print(f\"Platform: {platform.platform()}\")\n", "print(f\"CPU cores: {multiprocessing.cpu_count()}\")\n", "\n", "print(f\"Python version: {platform.python_version()}\")\n", "\n", "print(f\"PyTorch version: {torch.__version__}\")\n", "print(f\"GPU is visible: {torch.cuda.is_available()}\")\n", "\n", "print(f\"Transformers version: {transformers.__version__}\")\n", "print(f\"Datasets version: {datasets.__version__}\")\n", "\n", "print(f\"soundfile version: {soundfile.__version__}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Check your GPU informations (if any)\n", "If you launched an AI Training job with GPU resources, they should be listed below (Tesla V100s 32GB).\n", "Driver and CUDA version " ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "YT7fRnKctggU", "outputId": "f355a3e0-20da-489f-bd1f-5e508e792a68" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Thu Jan 27 11:15:57 2022 \n", "+-----------------------------------------------------------------------------+\n", "| NVIDIA-SMI 470.57.02 Driver Version: 470.57.02 CUDA Version: 11.4 |\n", "|-------------------------------+----------------------+----------------------+\n", "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n", "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n", "| | | MIG M. |\n", "|===============================+======================+======================|\n", "| 0 Tesla V100S-PCI... Off | 00000000:00:06.0 Off | 0 |\n", "| N/A 35C P0 25W / 250W | 4MiB / 32510MiB | 0% Default |\n", "| | | N/A |\n", "+-------------------------------+----------------------+----------------------+\n", " \n", "+-----------------------------------------------------------------------------+\n", "| Processes: |\n", "| GPU GI CI PID Type Process name GPU Memory |\n", "| ID ID Usage |\n", "|=============================================================================|\n", "| No running processes found |\n", "+-----------------------------------------------------------------------------+\n" ] } ], "source": [ "!nvidia-smi" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "2fa897b4afc049229144599af9e3f807", "version_major": 2, "version_minor": 0 }, "text/plain": [ "VBox(children=(HTML(value='
\\n] 29.64K --.-KB/s in 0.001s \n", "\n", "2022-01-22 15:01:09 (20.1 MB/s) - ‘run_speech_recognition_ctc.py’ saved [30348/30348]\n", "\n" ] } ], "source": [ "!wget -O run_speech_recognition_ctc.py https://raw.githubusercontent.com/huggingface/transformers/master/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# \t--learning_rate=\"7.5e-5\" \\\n", "# 84.5" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Mz4bubhxxsad", "outputId": "23398525-cc19-43c2-9fec-497e06214f29" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "01/27/2022 12:08:42 - WARNING - __main__ - Process rank: -1, device: cuda:0, n_gpu: 1distributed training: False, 16-bits training: True\n", "01/27/2022 12:08:42 - INFO - __main__ - Training/evaluation parameters TrainingArguments(\n", "_n_gpu=1,\n", "adafactor=False,\n", "adam_beta1=0.9,\n", "adam_beta2=0.999,\n", "adam_epsilon=1e-08,\n", "bf16=False,\n", "bf16_full_eval=False,\n", "dataloader_drop_last=False,\n", "dataloader_num_workers=0,\n", "dataloader_pin_memory=True,\n", "ddp_bucket_cap_mb=None,\n", "ddp_find_unused_parameters=None,\n", "debug=[],\n", "deepspeed=None,\n", "disable_tqdm=False,\n", "do_eval=True,\n", "do_predict=False,\n", "do_train=True,\n", "eval_accumulation_steps=None,\n", "eval_steps=500,\n", "evaluation_strategy=IntervalStrategy.STEPS,\n", "fp16=True,\n", "fp16_backend=auto,\n", "fp16_full_eval=False,\n", "fp16_opt_level=O1,\n", "gradient_accumulation_steps=1,\n", "gradient_checkpointing=True,\n", "greater_is_better=None,\n", "group_by_length=True,\n", "half_precision_backend=auto,\n", "hub_model_id=None,\n", "hub_strategy=HubStrategy.EVERY_SAVE,\n", "hub_token=,\n", "ignore_data_skip=False,\n", "label_names=None,\n", "label_smoothing_factor=0.0,\n", "learning_rate=0.0003,\n", "length_column_name=input_length,\n", "load_best_model_at_end=False,\n", "local_rank=-1,\n", "log_level=-1,\n", "log_level_replica=-1,\n", "log_on_each_node=True,\n", "logging_dir=./wav2vec2-large-xls-r-300m-georgian/runs/Jan27_12-08-42_job-8be8b741-e32e-4579-bbec-1e00d9824b4f,\n", "logging_first_step=False,\n", "logging_nan_inf_filter=True,\n", "logging_steps=100,\n", "logging_strategy=IntervalStrategy.STEPS,\n", "lr_scheduler_type=SchedulerType.LINEAR,\n", "max_grad_norm=1.0,\n", "max_steps=-1,\n", "metric_for_best_model=None,\n", "mp_parameters=,\n", "no_cuda=False,\n", "num_train_epochs=100.0,\n", "optim=OptimizerNames.ADAMW_HF,\n", "output_dir=./wav2vec2-large-xls-r-300m-georgian,\n", "overwrite_output_dir=True,\n", "past_index=-1,\n", "per_device_eval_batch_size=32,\n", "per_device_train_batch_size=32,\n", "prediction_loss_only=False,\n", "push_to_hub=True,\n", "push_to_hub_model_id=None,\n", "push_to_hub_organization=None,\n", "push_to_hub_token=,\n", "remove_unused_columns=True,\n", "report_to=[],\n", "resume_from_checkpoint=None,\n", "run_name=./wav2vec2-large-xls-r-300m-georgian,\n", "save_on_each_node=False,\n", "save_steps=500,\n", "save_strategy=IntervalStrategy.STEPS,\n", "save_total_limit=2,\n", "seed=42,\n", "sharded_ddp=[],\n", "skip_memory_metrics=True,\n", "tf32=None,\n", "tpu_metrics_debug=False,\n", "tpu_num_cores=None,\n", "use_legacy_prediction_loop=False,\n", "warmup_ratio=0.0,\n", "warmup_steps=500,\n", "weight_decay=0.0,\n", "xpu_backend=None,\n", ")\n", "01/27/2022 12:08:44 - WARNING - datasets.builder - Reusing dataset common_voice (/workspace/.cache/huggingface/datasets/mozilla-foundation___common_voice/ka/7.0.0/fe20cac47c166e25b1f096ab661832e3da7cf298ed4a91dcaa1343ad972d175b)\n", "01/27/2022 12:08:46 - WARNING - datasets.builder - Reusing dataset common_voice (/workspace/.cache/huggingface/datasets/mozilla-foundation___common_voice/ka/7.0.0/fe20cac47c166e25b1f096ab661832e3da7cf298ed4a91dcaa1343ad972d175b)\n", "01/27/2022 12:08:46 - WARNING - datasets.arrow_dataset - Loading cached processed dataset at /workspace/.cache/huggingface/datasets/mozilla-foundation___common_voice/ka/7.0.0/fe20cac47c166e25b1f096ab661832e3da7cf298ed4a91dcaa1343ad972d175b/cache-ee96a598f17b1f41.arrow\n", "01/27/2022 12:08:46 - WARNING - datasets.arrow_dataset - Loading cached processed dataset at /workspace/.cache/huggingface/datasets/mozilla-foundation___common_voice/ka/7.0.0/fe20cac47c166e25b1f096ab661832e3da7cf298ed4a91dcaa1343ad972d175b/cache-d682850d734dbca0.arrow\n", "loading configuration file https://huggingface.co./facebook/wav2vec2-xls-r-300m/resolve/main/config.json from cache at /workspace/.cache/huggingface/transformers/dabc27df63e37bd2a7a221c7774e35f36a280fbdf917cf54cadfc7df8c786f6f.a3e4c3c967d9985881e0ae550a5f6f668f897db5ab2e0802f9b97973b15970e6\n", "Model config Wav2Vec2Config {\n", " \"_name_or_path\": \"facebook/wav2vec2-xls-r-300m\",\n", " \"activation_dropout\": 0.0,\n", " \"adapter_kernel_size\": 3,\n", " \"adapter_stride\": 2,\n", " \"add_adapter\": false,\n", " \"apply_spec_augment\": true,\n", " \"architectures\": [\n", " \"Wav2Vec2ForPreTraining\"\n", " ],\n", " \"attention_dropout\": 0.1,\n", " \"bos_token_id\": 1,\n", " \"classifier_proj_size\": 256,\n", " \"codevector_dim\": 768,\n", " \"contrastive_logits_temperature\": 0.1,\n", " \"conv_bias\": true,\n", " \"conv_dim\": [\n", " 512,\n", " 512,\n", " 512,\n", " 512,\n", " 512,\n", " 512,\n", " 512\n", " ],\n", " \"conv_kernel\": [\n", " 10,\n", " 3,\n", " 3,\n", " 3,\n", " 3,\n", " 2,\n", " 2\n", " ],\n", " \"conv_stride\": [\n", " 5,\n", " 2,\n", " 2,\n", " 2,\n", " 2,\n", " 2,\n", " 2\n", " ],\n", " \"ctc_loss_reduction\": \"sum\",\n", " \"ctc_zero_infinity\": false,\n", " \"diversity_loss_weight\": 0.1,\n", " \"do_stable_layer_norm\": true,\n", " \"eos_token_id\": 2,\n", " \"feat_extract_activation\": \"gelu\",\n", " \"feat_extract_dropout\": 0.0,\n", " \"feat_extract_norm\": \"layer\",\n", " \"feat_proj_dropout\": 0.1,\n", " \"feat_quantizer_dropout\": 0.0,\n", " \"final_dropout\": 0.0,\n", " \"gradient_checkpointing\": false,\n", " \"hidden_act\": \"gelu\",\n", " \"hidden_dropout\": 0.1,\n", " \"hidden_size\": 1024,\n", " \"initializer_range\": 0.02,\n", " \"intermediate_size\": 4096,\n", " \"layer_norm_eps\": 1e-05,\n", " \"layerdrop\": 0.1,\n", " \"mask_feature_length\": 10,\n", " \"mask_feature_min_masks\": 0,\n", " \"mask_feature_prob\": 0.0,\n", " \"mask_time_length\": 10,\n", " \"mask_time_min_masks\": 2,\n", " \"mask_time_prob\": 0.075,\n", " \"model_type\": \"wav2vec2\",\n", " \"num_adapter_layers\": 3,\n", " \"num_attention_heads\": 16,\n", " \"num_codevector_groups\": 2,\n", " \"num_codevectors_per_group\": 320,\n", " \"num_conv_pos_embedding_groups\": 16,\n", " \"num_conv_pos_embeddings\": 128,\n", " \"num_feat_extract_layers\": 7,\n", " \"num_hidden_layers\": 24,\n", " \"num_negatives\": 100,\n", " \"output_hidden_size\": 1024,\n", " \"pad_token_id\": 0,\n", " \"proj_codevector_dim\": 768,\n", " \"tdnn_dilation\": [\n", " 1,\n", " 2,\n", " 3,\n", " 1,\n", " 1\n", " ],\n", " \"tdnn_dim\": [\n", " 512,\n", " 512,\n", " 512,\n", " 512,\n", " 1500\n", " ],\n", " \"tdnn_kernel\": [\n", " 5,\n", " 3,\n", " 3,\n", " 1,\n", " 1\n", " ],\n", " \"torch_dtype\": \"float32\",\n", " \"transformers_version\": \"4.16.0.dev0\",\n", " \"use_weighted_layer_sum\": false,\n", " \"vocab_size\": 32,\n", " \"xvector_output_dim\": 512\n", "}\n", "\n", "100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 4.06ba/s]\n", "100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 17.06ba/s]\n", "Didn't find file ./wav2vec2-large-xls-r-300m-georgian/tokenizer.json. We won't load it.\n", "loading file ./wav2vec2-large-xls-r-300m-georgian/vocab.json\n", "loading file ./wav2vec2-large-xls-r-300m-georgian/tokenizer_config.json\n", "loading file ./wav2vec2-large-xls-r-300m-georgian/added_tokens.json\n", "loading file ./wav2vec2-large-xls-r-300m-georgian/special_tokens_map.json\n", "loading file None\n", "Adding to the vocabulary\n", "Adding to the vocabulary\n", "loading configuration file https://huggingface.co./facebook/wav2vec2-xls-r-300m/resolve/main/config.json from cache at /workspace/.cache/huggingface/transformers/dabc27df63e37bd2a7a221c7774e35f36a280fbdf917cf54cadfc7df8c786f6f.a3e4c3c967d9985881e0ae550a5f6f668f897db5ab2e0802f9b97973b15970e6\n", "Model config Wav2Vec2Config {\n", " \"_name_or_path\": \"facebook/wav2vec2-xls-r-300m\",\n", " \"activation_dropout\": 0.0,\n", " \"adapter_kernel_size\": 3,\n", " \"adapter_stride\": 2,\n", " \"add_adapter\": false,\n", " \"apply_spec_augment\": true,\n", " \"architectures\": [\n", " \"Wav2Vec2ForPreTraining\"\n", " ],\n", " \"attention_dropout\": 0.1,\n", " \"bos_token_id\": 1,\n", " \"classifier_proj_size\": 256,\n", " \"codevector_dim\": 768,\n", " \"contrastive_logits_temperature\": 0.1,\n", " \"conv_bias\": true,\n", " \"conv_dim\": [\n", " 512,\n", " 512,\n", " 512,\n", " 512,\n", " 512,\n", " 512,\n", " 512\n", " ],\n", " \"conv_kernel\": [\n", " 10,\n", " 3,\n", " 3,\n", " 3,\n", " 3,\n", " 2,\n", " 2\n", " ],\n", " \"conv_stride\": [\n", " 5,\n", " 2,\n", " 2,\n", " 2,\n", " 2,\n", " 2,\n", " 2\n", " ],\n", " \"ctc_loss_reduction\": \"sum\",\n", " \"ctc_zero_infinity\": false,\n", " \"diversity_loss_weight\": 0.1,\n", " \"do_stable_layer_norm\": true,\n", " \"eos_token_id\": 2,\n", " \"feat_extract_activation\": \"gelu\",\n", " \"feat_extract_dropout\": 0.0,\n", " \"feat_extract_norm\": \"layer\",\n", " \"feat_proj_dropout\": 0.1,\n", " \"feat_quantizer_dropout\": 0.0,\n", " \"final_dropout\": 0.0,\n", " \"gradient_checkpointing\": false,\n", " \"hidden_act\": \"gelu\",\n", " \"hidden_dropout\": 0.1,\n", " \"hidden_size\": 1024,\n", " \"initializer_range\": 0.02,\n", " \"intermediate_size\": 4096,\n", " \"layer_norm_eps\": 1e-05,\n", " \"layerdrop\": 0.1,\n", " \"mask_feature_length\": 10,\n", " \"mask_feature_min_masks\": 0,\n", " \"mask_feature_prob\": 0.0,\n", " \"mask_time_length\": 10,\n", " \"mask_time_min_masks\": 2,\n", " \"mask_time_prob\": 0.075,\n", " \"model_type\": \"wav2vec2\",\n", " \"num_adapter_layers\": 3,\n", " \"num_attention_heads\": 16,\n", " \"num_codevector_groups\": 2,\n", " \"num_codevectors_per_group\": 320,\n", " \"num_conv_pos_embedding_groups\": 16,\n", " \"num_conv_pos_embeddings\": 128,\n", " \"num_feat_extract_layers\": 7,\n", " \"num_hidden_layers\": 24,\n", " \"num_negatives\": 100,\n", " \"output_hidden_size\": 1024,\n", " \"pad_token_id\": 0,\n", " \"proj_codevector_dim\": 768,\n", " \"tdnn_dilation\": [\n", " 1,\n", " 2,\n", " 3,\n", " 1,\n", " 1\n", " ],\n", " \"tdnn_dim\": [\n", " 512,\n", " 512,\n", " 512,\n", " 512,\n", " 1500\n", " ],\n", " \"tdnn_kernel\": [\n", " 5,\n", " 3,\n", " 3,\n", " 1,\n", " 1\n", " ],\n", " \"torch_dtype\": \"float32\",\n", " \"transformers_version\": \"4.16.0.dev0\",\n", " \"use_weighted_layer_sum\": false,\n", " \"vocab_size\": 32,\n", " \"xvector_output_dim\": 512\n", "}\n", "\n", "loading feature extractor configuration file https://huggingface.co./facebook/wav2vec2-xls-r-300m/resolve/main/preprocessor_config.json from cache at /workspace/.cache/huggingface/transformers/6fb028b95b394059e7d3b367bbca2382b576c66aebe896f04d2cd34e1b575f5b.d4484dc1c81456a2461485e7168b04347a7b9a4e3b1ef3aba723323b33e12326\n", "Feature extractor Wav2Vec2FeatureExtractor {\n", " \"do_normalize\": true,\n", " \"feature_extractor_type\": \"Wav2Vec2FeatureExtractor\",\n", " \"feature_size\": 1,\n", " \"padding_side\": \"right\",\n", " \"padding_value\": 0,\n", " \"return_attention_mask\": true,\n", " \"sampling_rate\": 16000\n", "}\n", "\n", "loading weights file https://huggingface.co./facebook/wav2vec2-xls-r-300m/resolve/main/pytorch_model.bin from cache at /workspace/.cache/huggingface/transformers/1e6a6507f3b689035cd4b247e2a37c154e27f39143f31357a49b4e38baeccc36.1edb32803799e27ed554eb7dd935f6745b1a0b17b0ea256442fe24db6eb546cd\n", "Some weights of the model checkpoint at facebook/wav2vec2-xls-r-300m were not used when initializing Wav2Vec2ForCTC: ['quantizer.weight_proj.bias', 'project_q.weight', 'quantizer.codevectors', 'project_hid.weight', 'quantizer.weight_proj.weight', 'project_q.bias', 'project_hid.bias']\n", "- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", "- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", "Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-xls-r-300m and are newly initialized: ['lm_head.bias', 'lm_head.weight']\n", "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", "preprocess datasets: 100%|█████████████████| 2686/2686 [00:26<00:00, 101.24ex/s]\n", "preprocess datasets: 100%|█████████████████| 1225/1225 [00:11<00:00, 107.06ex/s]\n", "100%|████████████████████████████████████████████| 3/3 [00:00<00:00, 473.24ba/s]\n", "100%|████████████████████████████████████████████| 2/2 [00:00<00:00, 625.36ba/s]\n", "Configuration saved in ./wav2vec2-large-xls-r-300m-georgian/preprocessor_config.json\n", "tokenizer config file saved in ./wav2vec2-large-xls-r-300m-georgian/tokenizer_config.json\n", "Special tokens file saved in ./wav2vec2-large-xls-r-300m-georgian/special_tokens_map.json\n", "added tokens file saved in ./wav2vec2-large-xls-r-300m-georgian/added_tokens.json\n", "Configuration saved in ./wav2vec2-large-xls-r-300m-georgian/config.json\n", "loading feature extractor configuration file ./wav2vec2-large-xls-r-300m-georgian/preprocessor_config.json\n", "loading configuration file ./wav2vec2-large-xls-r-300m-georgian/config.json\n", "Model config Wav2Vec2Config {\n", " \"_name_or_path\": \"./wav2vec2-large-xls-r-300m-georgian\",\n", " \"activation_dropout\": 0.1,\n", " \"adapter_kernel_size\": 3,\n", " \"adapter_stride\": 2,\n", " \"add_adapter\": false,\n", " \"apply_spec_augment\": true,\n", " \"architectures\": [\n", " \"Wav2Vec2ForPreTraining\"\n", " ],\n", " \"attention_dropout\": 0.0,\n", " \"bos_token_id\": 1,\n", " \"classifier_proj_size\": 256,\n", " \"codevector_dim\": 768,\n", " \"contrastive_logits_temperature\": 0.1,\n", " \"conv_bias\": true,\n", " \"conv_dim\": [\n", " 512,\n", " 512,\n", " 512,\n", " 512,\n", " 512,\n", " 512,\n", " 512\n", " ],\n", " \"conv_kernel\": [\n", " 10,\n", " 3,\n", " 3,\n", " 3,\n", " 3,\n", " 2,\n", " 2\n", " ],\n", " \"conv_stride\": [\n", " 5,\n", " 2,\n", " 2,\n", " 2,\n", " 2,\n", " 2,\n", " 2\n", " ],\n", " \"ctc_loss_reduction\": \"mean\",\n", " \"ctc_zero_infinity\": false,\n", " \"diversity_loss_weight\": 0.1,\n", " \"do_stable_layer_norm\": true,\n", " \"eos_token_id\": 2,\n", " \"feat_extract_activation\": \"gelu\",\n", " \"feat_extract_dropout\": 0.0,\n", " \"feat_extract_norm\": \"layer\",\n", " \"feat_proj_dropout\": 0.0,\n", " \"feat_quantizer_dropout\": 0.0,\n", " \"final_dropout\": 0.0,\n", " \"hidden_act\": \"gelu\",\n", " \"hidden_dropout\": 0.0,\n", " \"hidden_size\": 1024,\n", " \"initializer_range\": 0.02,\n", " \"intermediate_size\": 4096,\n", " \"layer_norm_eps\": 1e-05,\n", " \"layerdrop\": 0.0,\n", " \"mask_feature_length\": 64,\n", " \"mask_feature_min_masks\": 0,\n", " \"mask_feature_prob\": 0.25,\n", " \"mask_time_length\": 10,\n", " \"mask_time_min_masks\": 2,\n", " \"mask_time_prob\": 0.75,\n", " \"model_type\": \"wav2vec2\",\n", " \"num_adapter_layers\": 3,\n", " \"num_attention_heads\": 16,\n", " \"num_codevector_groups\": 2,\n", " \"num_codevectors_per_group\": 320,\n", " \"num_conv_pos_embedding_groups\": 16,\n", " \"num_conv_pos_embeddings\": 128,\n", " \"num_feat_extract_layers\": 7,\n", " \"num_hidden_layers\": 24,\n", " \"num_negatives\": 100,\n", " \"output_hidden_size\": 1024,\n", " \"pad_token_id\": 36,\n", " \"proj_codevector_dim\": 768,\n", " \"tdnn_dilation\": [\n", " 1,\n", " 2,\n", " 3,\n", " 1,\n", " 1\n", " ],\n", " \"tdnn_dim\": [\n", " 512,\n", " 512,\n", " 512,\n", " 512,\n", " 1500\n", " ],\n", " \"tdnn_kernel\": [\n", " 5,\n", " 3,\n", " 3,\n", " 1,\n", " 1\n", " ],\n", " \"torch_dtype\": \"float32\",\n", " \"transformers_version\": \"4.16.0.dev0\",\n", " \"use_weighted_layer_sum\": false,\n", " \"vocab_size\": 39,\n", " \"xvector_output_dim\": 512\n", "}\n", "\n", "loading feature extractor configuration file ./wav2vec2-large-xls-r-300m-georgian/preprocessor_config.json\n", "Feature extractor Wav2Vec2FeatureExtractor {\n", " \"do_normalize\": true,\n", " \"feature_extractor_type\": \"Wav2Vec2FeatureExtractor\",\n", " \"feature_size\": 1,\n", " \"padding_side\": \"right\",\n", " \"padding_value\": 0,\n", " \"return_attention_mask\": true,\n", " \"sampling_rate\": 16000\n", "}\n", "\n", "Didn't find file ./wav2vec2-large-xls-r-300m-georgian/tokenizer.json. We won't load it.\n", "loading file ./wav2vec2-large-xls-r-300m-georgian/vocab.json\n", "loading file ./wav2vec2-large-xls-r-300m-georgian/tokenizer_config.json\n", "loading file ./wav2vec2-large-xls-r-300m-georgian/added_tokens.json\n", "loading file ./wav2vec2-large-xls-r-300m-georgian/special_tokens_map.json\n", "loading file None\n", "Adding to the vocabulary\n", "Adding to the vocabulary\n", "/workspace/votic_training/./wav2vec2-large-xls-r-300m-georgian is already a clone of https://huggingface.co./infinitejoy/wav2vec2-large-xls-r-300m-georgian. Make sure you pull the latest changes with `repo.git_pull()`.\n", "01/27/2022 12:09:40 - WARNING - huggingface_hub.repository - /workspace/votic_training/./wav2vec2-large-xls-r-300m-georgian is already a clone of https://huggingface.co./infinitejoy/wav2vec2-large-xls-r-300m-georgian. Make sure you pull the latest changes with `repo.git_pull()`.\n", "Using amp half precision backend\n", "The following columns in the training set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length.\n", "/opt/conda/lib/python3.8/site-packages/transformers/optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use thePyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n", " warnings.warn(\n", "***** Running training *****\n", " Num examples = 2686\n", " Num Epochs = 100\n", " Instantaneous batch size per device = 32\n", " Total train batch size (w. parallel, distributed & accumulation) = 32\n", " Gradient Accumulation steps = 1\n", " Total optimization steps = 8400\n", "{'loss': 6.951, 'learning_rate': 5.88e-05, 'epoch': 1.19} \n", "{'loss': 3.1694, 'learning_rate': 0.0001188, 'epoch': 2.38} \n", "{'loss': 3.0357, 'learning_rate': 0.00017879999999999998, 'epoch': 3.57} \n", "{'loss': 2.8568, 'learning_rate': 0.0002388, 'epoch': 4.76} \n", "{'loss': 1.8805, 'learning_rate': 0.0002988, 'epoch': 5.95} \n", " 6%|██▎ | 500/8400 [14:21<2:21:00, 1.07s/it]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length.\n", "***** Running Evaluation *****\n", " Num examples = 1225\n", " Batch size = 32\n", "\n", " 0%| | 0/39 [00:00 main\n", "\n", "01/27/2022 16:48:08 - WARNING - huggingface_hub.repository - To https://huggingface.co./infinitejoy/wav2vec2-large-xls-r-300m-georgian\n", " 2dc9d89..464889d main -> main\n", "\n", "Upload file pytorch_model.bin: 100%|███████| 1.18G/1.18G [02:06<00:00, 9.96MB/s]\n", "Dropping the following result as it does not have all the necessary fields:\n", "{'dataset': {'name': 'MOZILLA-FOUNDATION/COMMON_VOICE_7_0 - KA', 'type': 'common_voice', 'args': 'Config: ka, Training split: train+validation, Eval split: test'}}\n", "To https://huggingface.co./infinitejoy/wav2vec2-large-xls-r-300m-georgian\n", " 464889d..6e7a45b main -> main\n", "\n", "01/27/2022 16:48:15 - WARNING - huggingface_hub.repository - To https://huggingface.co./infinitejoy/wav2vec2-large-xls-r-300m-georgian\n", " 464889d..6e7a45b main -> main\n", "\n" ] } ], "source": [ "!python run_speech_recognition_ctc.py \\\n", "\t--dataset_name=\"mozilla-foundation/common_voice_7_0\" \\\n", "\t--model_name_or_path=\"facebook/wav2vec2-xls-r-300m\" \\\n", "\t--dataset_config_name=\"ka\" \\\n", "\t--output_dir=\"./wav2vec2-large-xls-r-300m-georgian\" \\\n", "\t--overwrite_output_dir \\\n", "\t--num_train_epochs=\"100\" \\\n", "\t--per_device_train_batch_size=\"32\" \\\n", "\t--per_device_eval_batch_size=\"32\" \\\n", "\t--gradient_accumulation_steps=\"1\" \\\n", "\t--learning_rate=\"3e-4\" \\\n", "\t--warmup_steps=\"500\" \\\n", "\t--length_column_name=\"input_length\" \\\n", "\t--evaluation_strategy=\"steps\" \\\n", "\t--text_column_name=\"sentence\" \\\n", "\t--chars_to_ignore , ? . ! \\- \\; \\: \\\" “ % ‘ ” � — ’ … – \\\n", "\t--save_steps=\"500\" \\\n", "\t--eval_steps=\"500\" \\\n", "\t--logging_steps=\"100\" \\\n", "\t--layerdrop=\"0.0\" \\\n", "\t--activation_dropout=\"0.1\" \\\n", "\t--save_total_limit=\"2\" \\\n", "\t--freeze_feature_encoder \\\n", "\t--feat_proj_dropout=\"0.0\" \\\n", "\t--mask_time_prob=\"0.75\" \\\n", "\t--mask_time_length=\"10\" \\\n", "\t--mask_feature_prob=\"0.25\" \\\n", "\t--mask_feature_length=\"64\" \\\n", "\t--gradient_checkpointing \\\n", "\t--use_auth_token \\\n", "\t--fp16 \\\n", "\t--group_by_length \\\n", "\t--do_train --do_eval \\\n", " --push_to_hub" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# !rm -rf wav2vec2-large-xls-r-300m-bashkir" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!ls -ltr" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Filesystem Size Used Avail Use% Mounted on\n", "overlay 3.5T 1.2T 2.2T 34% /\n", "tmpfs 64M 0 64M 0% /dev\n", "tmpfs 87G 0 87G 0% /sys/fs/cgroup\n", "tmpfs 87G 0 87G 0% /dev/shm\n", "/dev/md0 3.5T 1.2T 2.2T 34% /etc/group\n", "tmpfs 87G 12K 87G 1% /proc/driver/nvidia\n", "/dev/vda1 49G 6.5G 42G 14% /usr/bin/nvidia-smi\n", "udev 87G 0 87G 0% /dev/nvidia0\n", "tmpfs 87G 0 87G 0% /proc/acpi\n", "tmpfs 87G 0 87G 0% /proc/scsi\n", "tmpfs 87G 0 87G 0% /sys/firmware\n" ] } ], "source": [ "!df -h" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Reusing dataset common_voice (/workspace/.cache/huggingface/datasets/mozilla-foundation___common_voice/ka/7.0.0/fe20cac47c166e25b1f096ab661832e3da7cf298ed4a91dcaa1343ad972d175b)\n", "Reusing dataset common_voice (/workspace/.cache/huggingface/datasets/mozilla-foundation___common_voice/ka/7.0.0/fe20cac47c166e25b1f096ab661832e3da7cf298ed4a91dcaa1343ad972d175b)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2686\n" ] } ], "source": [ "from datasets import load_dataset, load_metric, Audio\n", "\n", "common_voice_train = load_dataset(\"mozilla-foundation/common_voice_7_0\", \"ka\", use_auth_token=True, split=\"train+validation\")\n", "common_voice_test = load_dataset(\"mozilla-foundation/common_voice_7_0\", \"ka\", use_auth_token=True, split=\"test\")\n", "\n", "print(len(common_voice_train))" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Dataset({\n", " features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],\n", " num_rows: 2686\n", "})" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "common_voice_train" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "8393.75" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(common_voice_train) * 100 / 32" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "common_voice_train = common_voice_train.remove_columns([\"accent\", \"age\", \"client_id\", \"down_votes\", \"gender\", \"locale\", \"segment\", \"up_votes\"])\n", "common_voice_test = common_voice_test.remove_columns([\"accent\", \"age\", \"client_id\", \"down_votes\", \"gender\", \"locale\", \"segment\", \"up_votes\"])" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "from datasets import ClassLabel\n", "import random\n", "import pandas as pd\n", "from IPython.display import display, HTML\n", "\n", "def show_random_elements(dataset, num_examples=10):\n", " assert num_examples <= len(dataset), \"Can't pick more elements than there are in the dataset.\"\n", " picks = []\n", " for _ in range(num_examples):\n", " pick = random.randint(0, len(dataset)-1)\n", " while pick in picks:\n", " pick = random.randint(0, len(dataset)-1)\n", " picks.append(pick)\n", " \n", " df = pd.DataFrame(dataset[picks])\n", " display(HTML(df.to_html()))" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sentence
0ეს ემბლემა წარმოადგენს მხიარულ, პატარა კაცს, რომელიც ჯანსაღი და ენერგიული მოქალაქეების სიმბოლოა.
1სახელი ეწოდა ნაყოფიერების რომაული ღვთაების, მერკურის დედის, მაიას მიხედვით.
2პრეზიდენტმა შერილ ბუნ-ისააკმა წევრობის ახალი წესები შემოიღო.
3სიცოცხლის მოკლე ხანგრძლივობა პოპულაციის ზრდის ნელ ტემპს განაპირობებს.
4იგი ხელმძღვანელობდა მეცნიერებათა აკადემიის გეოლოგიისა და მინერალოგიის ინსტიტუტის მინერალოგიის სექტორს.
5ნუსხურსა და მხედრულში იგი ქვედა სამ ხაზში იწერება.
6ოტელოს როლს რასელ ტომასი შეასრულებს, დეზდემონას კი – ლეა გროსეტო.
7ექსპერტთა შეფასებით ნახევარი მილიონი თევზი დაიღუპა, ზოგიერთი სახეობა კი საერთოდ გაქრა.
8შესაძლოა ადრე ხაჭოს შერევით მზადდებოდა.
9აგრეთვე დადებითი იყო კრიტიკოსების შეფასებები.
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "show_random_elements(common_voice_train.remove_columns([\"path\", \"audio\"]), num_examples=10)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "import re\n", "chars_to_remove_regex = '[\\,\\?\\.\\!\\-\\;\\:\\\"\\“\\%\\‘\\”\\�\\—\\’\\…\\–]'\n", "\n", "def remove_special_characters(batch):\n", " batch[\"sentence\"] = re.sub(chars_to_remove_regex, '', batch[\"sentence\"]).lower()\n", " return batch" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "2a03c54c25734f67ac41b3740e0238ff", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/2686 [00:00 main\n", "\n" ] }, { "data": { "text/plain": [ "'https://huggingface.co./infinitejoy/wav2vec2-large-xls-r-300m-georgian/commit/f7ddf2b62bce10ef8de99d8446558744958780a3'" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "vocab_dict[\"|\"] = vocab_dict[\" \"]\n", "del vocab_dict[\" \"]\n", "\n", "vocab_dict[\"[UNK]\"] = len(vocab_dict)\n", "vocab_dict[\"[PAD]\"] = len(vocab_dict)\n", "print(len(vocab_dict))\n", "\n", "import json\n", "with open('./vocab.json', 'w') as vocab_file:\n", " json.dump(vocab_dict, vocab_file)\n", " \n", "from transformers import Wav2Vec2CTCTokenizer\n", "\n", "tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(\"./\", unk_token=\"[UNK]\", pad_token=\"[PAD]\", word_delimiter_token=\"|\")\n", "\n", "repo_name = \"wav2vec2-large-xls-r-300m-georgian\"\n", "\n", "tokenizer.push_to_hub(repo_name)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "--2022-01-27 16:52:27-- https://raw.githubusercontent.com/huggingface/transformers/master/examples/research_projects/robust-speech-event/eval.py\n", "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.111.133, ...\n", "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 4738 (4.6K) [text/plain]\n", "Saving to: ‘eval.py’\n", "\n", "eval.py 100%[===================>] 4.63K --.-KB/s in 0s \n", "\n", "2022-01-27 16:52:27 (21.9 MB/s) - ‘eval.py’ saved [4738/4738]\n", "\n", "total 1232584\n", "-rw-r--r-- 1 ovh ovh 399 Jan 27 12:09 vocab.json\n", "-rw-r--r-- 1 ovh ovh 294 Jan 27 12:09 tokenizer_config.json\n", "-rw-r--r-- 1 ovh ovh 695 Jan 27 12:09 special_tokens_map.json\n", "-rw-r--r-- 1 ovh ovh 23 Jan 27 12:09 added_tokens.json\n", "drwxr-xr-x 2 ovh ovh 4096 Jan 27 16:15 checkpoint-7500\n", "drwxr-xr-x 2 ovh ovh 4096 Jan 27 16:31 checkpoint-8000\n", "-rw-r--r-- 1 ovh ovh 197 Jan 27 16:43 train_results.json\n", "-rw-r--r-- 1 ovh ovh 14654 Jan 27 16:43 trainer_state.json\n", "-rw-r--r-- 1 ovh ovh 225 Jan 27 16:44 eval_results.json\n", "-rw-r--r-- 1 ovh ovh 2033 Jan 27 16:44 config.json\n", "-rw-r--r-- 1 ovh ovh 400 Jan 27 16:44 all_results.json\n", "-rw-r--r-- 1 ovh ovh 1262083569 Jan 27 16:44 pytorch_model.bin\n", "-rw-r--r-- 1 ovh ovh 3055 Jan 27 16:44 training_args.bin\n", "-rw-r--r-- 1 ovh ovh 212 Jan 27 16:44 preprocessor_config.json\n", "-rw-r--r-- 1 ovh ovh 2484 Jan 27 16:48 README.md\n", "-rw-r--r-- 1 ovh ovh 4738 Jan 27 16:52 eval.py\n" ] } ], "source": [ "!wget -O eval.py https://raw.githubusercontent.com/huggingface/transformers/master/examples/research_projects/robust-speech-event/eval.py\n", "!cp eval.py wav2vec2-large-xls-r-300m-georgian\n", "!ls -ltr wav2vec2-large-xls-r-300m-georgian" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Error: mkl-service + Intel(R) MKL: MKL_THREADING_LAYER=INTEL is incompatible with libgomp-a34b3233.so.1 library.\n", "\tTry to import numpy first or set the threading layer accordingly. Set MKL_SERVICE_FORCE_INTEL to force it.\n" ] } ], "source": [ "!cd wav2vec2-large-xls-r-300m-georgian; python eval.py \\\n", " --model_id ./ --dataset mozilla-foundation/common_voice_7_0 --config ka --split test --log_outputs" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!cd wav2vec2-large-xls-r-300m-georgian; python eval.py \\\n", " --model_id infinitejoy/wav2vec2-large-xls-r-300m-georgian --dataset speech-recognition-community-v2/dev_data \\\n", " --config ka --split validation --chunk_length_s 10 --stride_length_s 1" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "24592b0be30e4eafb1949cf09d1c4fb4", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading: 0%| | 0.00/260 [00:00\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[0mlogits\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minput_values\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlogits\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 12\u001b[0;31m \u001b[0;32massert\u001b[0m \u001b[0mlogits\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m32\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlogits\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;31mAssertionError\u001b[0m: 55" ] } ], "source": [ "# from transformers import AutoModelForCTC, AutoProcessor\n", "# from datasets import load_dataset\n", "\n", "# model = AutoModelForCTC.from_pretrained(\"infinitejoy/wav2vec2-large-xls-r-300m-bashkir\")\n", "# processor = AutoProcessor.from_pretrained(\"infinitejoy/wav2vec2-large-xls-r-300m-bashkir\")\n", "\n", "# input_values = processor(common_voice_test[0][\"audio\"][\"array\"], return_tensors=\"pt\", sampling_rate=16_000).input_values\n", "# # input_values = input_values.to(\"cuda\")\n", "\n", "# logits = model(input_values).logits\n", "\n", "# assert logits.shape[-1] == 32, logits.shape[-1]" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Reusing dataset common_voice (/workspace/.cache/huggingface/datasets/mozilla-foundation___common_voice/ka/7.0.0/fe20cac47c166e25b1f096ab661832e3da7cf298ed4a91dcaa1343ad972d175b)\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "103935b56b6c4edea05219e5a86d101e", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading: 0%| | 0.00/1.99k [00:00