|
|
|
save_data: run |
|
|
|
src_vocab: run/vocab/gl-es/bpe.vocab.src |
|
tgt_vocab: run/vocab/gl-es/bpe.vocab.tgt |
|
overwrite: True |
|
|
|
|
|
data: |
|
europarl: |
|
path_tgt: corpora/europarl/partitions/es_train.txt |
|
path_src: corpora/europarl_translit/partitions/gl_train.txt |
|
transforms: [bpe, filtertoolong] |
|
weight: 120 |
|
opensub: |
|
path_tgt: corpora/opensub/partitions/es_train.txt |
|
path_src: corpora/opensub_translit/partitions/gl_train.txt |
|
transforms: [bpe, filtertoolong] |
|
weight: 180 |
|
dgt: |
|
path_tgt: corpora/dgt/partitions/es_train.txt |
|
path_src: corpora/dgt_translit/partitions/gl_train.txt |
|
transforms: [bpe, filtertoolong] |
|
weight: 18 |
|
cluvi: |
|
path_tgt: corpora/cluvi/partitions/es_train.txt |
|
path_src: corpora/cluvi/partitions/gl_train.txt |
|
transforms: [bpe, filtertoolong] |
|
weight: 40 |
|
opensub-es-gl: |
|
path_tgt: corpora/opensub-es-gl/partitions/es_train.txt |
|
path_src: corpora/opensub-es-gl/partitions/gl_train.txt |
|
transforms: [bpe, filtertoolong] |
|
weight: 25 |
|
ted2020: |
|
path_tgt: corpora/ted2020/partitions/es_train.txt |
|
path_src: corpora/ted2020/partitions/gl_train.txt |
|
transforms: [bpe, filtertoolong] |
|
weight: 10 |
|
corgaback: |
|
path_tgt: corpora/corgaback/partitions/es_train.txt |
|
path_src: corpora/corgaback/partitions/gl_train.txt |
|
transforms: [bpe, filtertoolong] |
|
weight: 13 |
|
ccmatrix: |
|
path_tgt: corpora/ccmatrix/es.txt |
|
path_src: corpora/ccmatrix/gl.txt |
|
transforms: [bpe, filtertoolong] |
|
weight: 180 |
|
resto: |
|
path_tgt: corpora/resto/es.txt |
|
path_src: corpora/resto/gl.txt |
|
transforms: [bpe, filtertoolong] |
|
weight: 120 |
|
opensub_2018: |
|
path_tgt: corpora/opensub_2018/es.txt |
|
path_src: corpora/opensub_2018/gl.txt |
|
transforms: [bpe, filtertoolong] |
|
weight: 25 |
|
|
|
|
|
valid: |
|
path_tgt: corpora/partitions/all-es_valid.txt |
|
path_src: corpora/partitions_translit/all-gl_valid.txt |
|
transforms: [bpe, filtertoolong] |
|
|
|
|
|
|
|
tgt_subword_model: ./bpe/es.code |
|
src_subword_model: ./bpe/gl.code |
|
tgt_subword_vocab: ./run/vocab/gl-es/bpe.vocab.tgt

src_subword_vocab: ./run/vocab/gl-es/bpe.vocab.src
|
src_subword_type: bpe |
|
tgt_subword_type: bpe
|
|
|
src_subword_nbest: 1 |
|
src_subword_alpha: 0.0 |
|
tgt_subword_nbest: 1 |
|
tgt_subword_alpha: 0.0 |
|
|
|
|
|
tgt_embeddings: ../embeddings/es.emb.txt |
|
src_embeddings: ../embeddings/gl.emb.txt |
|
|
|
|
|
embeddings_type: "word2vec" |
|
|
|
|
|
word_vec_size: 512 |
|
|
|
|
|
|
|
src_seq_length: 150 |
|
tgt_seq_length: 150 |
|
|
|
|
|
skip_empty_level: silent |
|
|
|
|
|
|
|
|
|
save_model: run/model |
|
keep_checkpoint: 50 |
|
save_checkpoint_steps: 10000 |
|
average_decay: 0.0005 |
|
seed: 1234 |
|
report_every: 1000 |
|
train_steps: 200000 |
|
valid_steps: 10000 |
|
|
|
|
|
queue_size: 10000 |
|
bucket_size: 32768 |
|
world_size: 1 |
|
gpu_ranks: [0] |
|
batch_type: "tokens" |
|
|
|
batch_size: 8192 |
|
valid_batch_size: 64 |
|
batch_size_multiple: 1 |
|
max_generator_batches: 2 |
|
accum_count: [4] |
|
accum_steps: [0] |
|
|
|
|
|
model_dtype: "fp16" |
|
optim: "adam" |
|
learning_rate: 2 |
|
|
|
warmup_steps: 8000 |
|
decay_method: "noam" |
|
adam_beta2: 0.998 |
|
max_grad_norm: 0 |
|
label_smoothing: 0.1 |
|
param_init: 0 |
|
param_init_glorot: true |
|
normalization: "tokens" |
|
|
|
|
|
encoder_type: transformer |
|
decoder_type: transformer |
|
position_encoding: true |
|
enc_layers: 6 |
|
dec_layers: 6 |
|
heads: 8 |
|
rnn_size: 512 |
|
transformer_ff: 2048 |
|
dropout_steps: [0] |
|
dropout: [0.1] |
|
attention_dropout: [0.1] |
|
share_decoder_embeddings: true |
|
share_embeddings: false |
|
|