IMS-ToucanTTS / Preprocessing /articulatory_features.py
NorHsangPha's picture
Initial commit
de6e35f verified
# -*- coding: utf-8 -*-
# partly derived from an open-source resource provided by Papercup Technologies Limited
# Resource-Author: Marlene Staib
# Modified by Florian Lux, 2021
# Further modified by Florian Lux, 2022
"""
All phonemes in the IPA standard are supported.
zero-width characters are generally not supported, as
well as some other modifiers. Tone, stress and
lengthening are represented with placeholder dimensions,
however they need to be set manually, this conversion
from phonemes to features works on a character by
character basis. In a few cases, the place of
articulation is approximated because only one phoneme
had such a combination, which does not warrant a new
dimension.
"""
def generate_feature_lookup():
return {
'~': {'symbol_type': 'silence'},
'#': {'symbol_type': 'end of sentence'},
'?': {'symbol_type': 'questionmark'},
'!': {'symbol_type': 'exclamationmark'},
'.': {'symbol_type': 'fullstop'},
' ': {'symbol_type': 'word-boundary'},
'ɜ': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'vowel',
'VUV' : 'voiced',
'vowel_frontness' : 'central',
'vowel_openness' : 'open-mid',
'vowel_roundedness': 'unrounded',
},
'ə': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'vowel',
'VUV' : 'voiced',
'vowel_frontness' : 'central',
'vowel_openness' : 'mid',
'vowel_roundedness': 'unrounded',
},
'a': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'vowel',
'VUV' : 'voiced',
'vowel_frontness' : 'front',
'vowel_openness' : 'open',
'vowel_roundedness': 'unrounded',
},
'ð': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'consonant',
'VUV' : 'voiced',
'consonant_place' : 'dental',
'consonant_manner': 'fricative'
},
'ɛ': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'vowel',
'VUV' : 'voiced',
'vowel_frontness' : 'front',
'vowel_openness' : 'open-mid',
'vowel_roundedness': 'unrounded',
},
'ɪ': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'vowel',
'VUV' : 'voiced',
'vowel_frontness' : 'front_central',
'vowel_openness' : 'close_close-mid',
'vowel_roundedness': 'unrounded',
},
'ŋ': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'consonant',
'VUV' : 'voiced',
'consonant_place' : 'velar',
'consonant_manner': 'nasal'
},
'ɔ': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'vowel',
'VUV' : 'voiced',
'vowel_frontness' : 'back',
'vowel_openness' : 'open-mid',
'vowel_roundedness': 'rounded',
},
'ɒ': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'vowel',
'VUV' : 'voiced',
'vowel_frontness' : 'back',
'vowel_openness' : 'open',
'vowel_roundedness': 'rounded',
},
'ɾ': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'consonant',
'VUV' : 'voiced',
'consonant_place' : 'alveolar',
'consonant_manner': 'flap'
},
'ʃ': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'consonant',
'VUV' : 'unvoiced',
'consonant_place' : 'postalveolar',
'consonant_manner': 'fricative'
},
'θ': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'consonant',
'VUV' : 'unvoiced',
'consonant_place' : 'dental',
'consonant_manner': 'fricative'
},
'ʊ': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'vowel',
'VUV' : 'voiced',
'vowel_frontness' : 'central_back',
'vowel_openness' : 'close_close-mid',
'vowel_roundedness': 'unrounded'
},
'ʌ': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'vowel',
'VUV' : 'voiced',
'vowel_frontness' : 'back',
'vowel_openness' : 'open-mid',
'vowel_roundedness': 'unrounded'
},
'ʒ': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'consonant',
'VUV' : 'voiced',
'consonant_place' : 'postalveolar',
'consonant_manner': 'fricative'
},
'æ': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'vowel',
'VUV' : 'voiced',
'vowel_frontness' : 'front',
'vowel_openness' : 'open-mid_open',
'vowel_roundedness': 'unrounded'
},
'b': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'consonant',
'VUV' : 'voiced',
'consonant_place' : 'bilabial',
'consonant_manner': 'plosive'
},
'ʔ': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'consonant',
'VUV' : 'unvoiced',
'consonant_place' : 'glottal',
'consonant_manner': 'plosive'
},
'd': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'consonant',
'VUV' : 'voiced',
'consonant_place' : 'alveolar',
'consonant_manner': 'plosive'
},
'e': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'vowel',
'VUV' : 'voiced',
'vowel_frontness' : 'front',
'vowel_openness' : 'close-mid',
'vowel_roundedness': 'unrounded'
},
'f': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'consonant',
'VUV' : 'unvoiced',
'consonant_place' : 'labiodental',
'consonant_manner': 'fricative'
},
'ɡ': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'consonant',
'VUV' : 'voiced',
'consonant_place' : 'velar',
'consonant_manner': 'plosive'
},
'h': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'consonant',
'VUV' : 'unvoiced',
'consonant_place' : 'glottal',
'consonant_manner': 'fricative'
},
'i': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'vowel',
'VUV' : 'voiced',
'vowel_frontness' : 'front',
'vowel_openness' : 'close',
'vowel_roundedness': 'unrounded'
},
'j': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'consonant',
'VUV' : 'voiced',
'consonant_place' : 'palatal',
'consonant_manner': 'approximant'
},
'k': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'consonant',
'VUV' : 'unvoiced',
'consonant_place' : 'velar',
'consonant_manner': 'plosive'
},
'l': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'consonant',
'VUV' : 'voiced',
'consonant_place' : 'alveolar',
'consonant_manner': 'lateral-approximant'
},
'm': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'consonant',
'VUV' : 'voiced',
'consonant_place' : 'bilabial',
'consonant_manner': 'nasal'
},
'n': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'consonant',
'VUV' : 'voiced',
'consonant_place' : 'alveolar',
'consonant_manner': 'nasal'
},
'ɳ': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'consonant',
'VUV' : 'voiced',
'consonant_place' : 'retroflex',
'consonant_manner': 'nasal'
},
'o': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'vowel',
'VUV' : 'voiced',
'vowel_frontness' : 'back',
'vowel_openness' : 'close-mid',
'vowel_roundedness': 'rounded'
},
'p': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'consonant',
'VUV' : 'unvoiced',
'consonant_place' : 'bilabial',
'consonant_manner': 'plosive'
},
'ɹ': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'consonant',
'VUV' : 'voiced',
'consonant_place' : 'alveolar',
'consonant_manner': 'approximant'
},
'r': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'consonant',
'VUV' : 'voiced',
'consonant_place' : 'alveolar',
'consonant_manner': 'trill'
},
's': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'consonant',
'VUV' : 'unvoiced',
'consonant_place' : 'alveolar',
'consonant_manner': 'fricative'
},
't': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'consonant',
'VUV' : 'unvoiced',
'consonant_place' : 'alveolar',
'consonant_manner': 'plosive'
},
'u': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'vowel',
'VUV' : 'voiced',
'vowel_frontness' : 'back',
'vowel_openness' : 'close',
'vowel_roundedness': 'rounded',
},
'v': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'consonant',
'VUV' : 'voiced',
'consonant_place' : 'labiodental',
'consonant_manner': 'fricative'
},
'w': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'consonant',
'VUV' : 'voiced',
'consonant_place' : 'labial-velar',
'consonant_manner': 'approximant'
},
'x': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'consonant',
'VUV' : 'unvoiced',
'consonant_place' : 'velar',
'consonant_manner': 'fricative'
},
'z': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'consonant',
'VUV' : 'voiced',
'consonant_place' : 'alveolar',
'consonant_manner': 'fricative'
},
'ʀ': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'consonant',
'VUV' : 'voiced',
'consonant_place' : 'uvular',
'consonant_manner': 'trill'
},
'ø': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'vowel',
'VUV' : 'voiced',
'vowel_frontness' : 'front',
'vowel_openness' : 'close-mid',
'vowel_roundedness': 'rounded'
},
'ç': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'consonant',
'VUV' : 'unvoiced',
'consonant_place' : 'palatal',
'consonant_manner': 'fricative'
},
'ɐ': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'vowel',
'VUV' : 'voiced',
'vowel_frontness' : 'central',
'vowel_openness' : 'open',
'vowel_roundedness': 'unrounded'
},
'œ': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'vowel',
'VUV' : 'voiced',
'vowel_frontness' : 'front',
'vowel_openness' : 'open-mid',
'vowel_roundedness': 'rounded'
},
'y': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'vowel',
'VUV' : 'voiced',
'vowel_frontness' : 'front',
'vowel_openness' : 'close',
'vowel_roundedness': 'rounded'
},
'ʏ': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'vowel',
'VUV' : 'voiced',
'vowel_frontness' : 'front_central',
'vowel_openness' : 'close_close-mid',
'vowel_roundedness': 'rounded'
},
'ɑ': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'vowel',
'VUV' : 'voiced',
'vowel_frontness' : 'back',
'vowel_openness' : 'open',
'vowel_roundedness': 'unrounded'
},
'c': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'consonant',
'VUV' : 'unvoiced',
'consonant_place' : 'palatal',
'consonant_manner': 'plosive'
},
'ɲ': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'consonant',
'VUV' : 'voiced',
'consonant_place' : 'palatal',
'consonant_manner': 'nasal'
},
'ɣ': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'consonant',
'VUV' : 'voiced',
'consonant_place' : 'velar',
'consonant_manner': 'fricative'
},
'ʎ': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'consonant',
'VUV' : 'voiced',
'consonant_place' : 'palatal',
'consonant_manner': 'lateral-approximant'
},
'β': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'consonant',
'VUV' : 'voiced',
'consonant_place' : 'bilabial',
'consonant_manner': 'fricative'
},
'ʝ': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'consonant',
'VUV' : 'voiced',
'consonant_place' : 'palatal',
'consonant_manner': 'fricative'
},
'ɟ': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'consonant',
'VUV' : 'voiced',
'consonant_place' : 'palatal',
'consonant_manner': 'plosive'
},
'q': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'consonant',
'VUV' : 'unvoiced',
'consonant_place' : 'uvular',
'consonant_manner': 'plosive'
},
'ɕ': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'consonant',
'VUV' : 'unvoiced',
'consonant_place' : 'alveolopalatal',
'consonant_manner': 'fricative'
},
'ɭ': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'consonant',
'VUV' : 'voiced',
'consonant_place' : 'retroflex',
'consonant_manner': 'lateral-approximant'
},
'ɵ': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'vowel',
'VUV' : 'voiced',
'vowel_frontness' : 'central',
'vowel_openness' : 'close-mid',
'vowel_roundedness': 'rounded'
},
'ʑ': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'consonant',
'VUV' : 'voiced',
'consonant_place' : 'alveolopalatal',
'consonant_manner': 'fricative'
},
'ʋ': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'consonant',
'VUV' : 'voiced',
'consonant_place' : 'labiodental',
'consonant_manner': 'approximant'
},
'ʁ': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'consonant',
'VUV' : 'voiced',
'consonant_place' : 'uvular',
'consonant_manner': 'fricative'
},
'ɨ': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'vowel',
'VUV' : 'voiced',
'vowel_frontness' : 'central',
'vowel_openness' : 'close',
'vowel_roundedness': 'unrounded'
},
'ʂ': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'consonant',
'VUV' : 'unvoiced',
'consonant_place' : 'retroflex',
'consonant_manner': 'fricative'
},
'ɓ': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'consonant',
'VUV' : 'voiced',
'consonant_place' : 'bilabial',
'consonant_manner': 'implosive'
},
'ʙ': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'consonant',
'VUV' : 'voiced',
'consonant_place' : 'bilabial',
'consonant_manner': 'vibrant'
},
'ɗ': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'consonant',
'VUV' : 'voiced',
'consonant_place' : 'dental',
'consonant_manner': 'implosive'
},
'ɖ': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'consonant',
'VUV' : 'voiced',
'consonant_place' : 'retroflex',
'consonant_manner': 'plosive'
},
'χ': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'consonant',
'VUV' : 'unvoiced',
'consonant_place' : 'uvular',
'consonant_manner': 'fricative'
},
'ʛ': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'consonant',
'VUV' : 'voiced',
'consonant_place' : 'uvular',
'consonant_manner': 'implosive'
},
'ʟ': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'consonant',
'VUV' : 'voiced',
'consonant_place' : 'velar',
'consonant_manner': 'lateral-approximant'
},
'ɽ': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'consonant',
'VUV' : 'voiced',
'consonant_place' : 'retroflex',
'consonant_manner': 'flap'
},
'ɢ': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'consonant',
'VUV' : 'voiced',
'consonant_place' : 'uvular',
'consonant_manner': 'plosive'
},
'ɠ': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'consonant',
'VUV' : 'voiced',
'consonant_place' : 'velar',
'consonant_manner': 'implosive'
},
'ǂ': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'consonant',
'VUV' : 'unvoiced',
'consonant_place' : 'alveolopalatal',
'consonant_manner': 'click'
},
'ɦ': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'consonant',
'VUV' : 'voiced',
'consonant_place' : 'glottal',
'consonant_manner': 'fricative'
},
'ǁ': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'consonant',
'VUV' : 'unvoiced',
'consonant_place' : 'alveolar',
'consonant_manner': 'click'
},
'ĩ': { # identical description with i except nasal
'symbol_type' : 'phoneme',
'vowel_consonant' : 'vowel',
'VUV' : 'voiced',
'vowel_frontness' : 'front',
'vowel_openness' : 'close',
'vowel_roundedness': 'unrounded',
'consonant_manner' : 'nasal'
},
'ʍ': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'consonant',
'VUV' : 'unvoiced',
'consonant_place' : 'labial-velar',
'consonant_manner': 'fricative'
},
'ʕ': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'consonant',
'VUV' : 'voiced',
'consonant_place' : 'pharyngal',
'consonant_manner': 'fricative'
},
'ɻ': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'consonant',
'VUV' : 'unvoiced',
'consonant_place' : 'retroflex',
'consonant_manner': 'approximant'
},
'ʄ': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'consonant',
'VUV' : 'voiced',
'consonant_place' : 'palatal',
'consonant_manner': 'implosive'
},
'ũ': { # identical with u, but nasal
'symbol_type' : 'phoneme',
'vowel_consonant' : 'vowel',
'VUV' : 'voiced',
'vowel_frontness' : 'back',
'vowel_openness' : 'close',
'vowel_roundedness': 'rounded',
'consonant_manner' : 'nasal'
},
'ɤ': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'vowel',
'VUV' : 'voiced',
'vowel_frontness' : 'back',
'vowel_openness' : 'close-mid',
'vowel_roundedness': 'unrounded',
},
'ɶ': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'vowel',
'VUV' : 'voiced',
'vowel_frontness' : 'front',
'vowel_openness' : 'open',
'vowel_roundedness': 'rounded',
},
'õ': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'vowel',
'VUV' : 'voiced',
'vowel_frontness' : 'back',
'vowel_openness' : 'close-mid',
'vowel_roundedness': 'rounded',
'consonant_manner' : 'nasal'
},
'ʡ': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'consonant',
'VUV' : 'unvoiced',
'consonant_place' : 'epiglottal',
'consonant_manner': 'plosive'
},
'ʈ': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'consonant',
'VUV' : 'unvoiced',
'consonant_place' : 'retroflex',
'consonant_manner': 'plosive'
},
'ʜ': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'consonant',
'VUV' : 'unvoiced',
'consonant_place' : 'epiglottal',
'consonant_manner': 'fricative'
},
'ɱ': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'consonant',
'VUV' : 'voiced',
'consonant_place' : 'labiodental',
'consonant_manner': 'nasal'
},
'ɯ': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'vowel',
'VUV' : 'voiced',
'vowel_frontness' : 'back',
'vowel_openness' : 'close',
'vowel_roundedness': 'unrounded'
},
'ǀ': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'consonant',
'VUV' : 'unvoiced',
'consonant_place' : 'dental',
'consonant_manner': 'click'
},
'ɸ': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'consonant',
'VUV' : 'unvoiced',
'consonant_place' : 'bilabial',
'consonant_manner': 'fricative'
},
'ʘ': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'consonant',
'VUV' : 'unvoiced',
'consonant_place' : 'bilabial',
'consonant_manner': 'click'
},
'ʐ': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'consonant',
'VUV' : 'voiced',
'consonant_place' : 'retroflex',
'consonant_manner': 'fricative'
},
'ɰ': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'consonant',
'VUV' : 'voiced',
'consonant_place' : 'velar',
'consonant_manner': 'approximant'
},
'ɘ': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'vowel',
'VUV' : 'voiced',
'vowel_frontness' : 'central',
'vowel_openness' : 'close-mid',
'vowel_roundedness': 'unrounded'
},
'ħ': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'consonant',
'VUV' : 'unvoiced',
'consonant_place' : 'pharyngal',
'consonant_manner': 'fricative'
},
'ɞ': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'vowel',
'VUV' : 'voiced',
'vowel_frontness' : 'central',
'vowel_openness' : 'open-mid',
'vowel_roundedness': 'rounded'
},
'ʉ': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'vowel',
'VUV' : 'voiced',
'vowel_frontness' : 'central',
'vowel_openness' : 'close',
'vowel_roundedness': 'rounded'
},
'ɴ': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'consonant',
'VUV' : 'voiced',
'consonant_place' : 'uvular',
'consonant_manner': 'nasal'
},
'ʢ': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'consonant',
'VUV' : 'voiced',
'consonant_place' : 'epiglottal',
'consonant_manner': 'fricative'
},
'ѵ': {
'symbol_type' : 'phoneme',
'vowel_consonant' : 'consonant',
'VUV' : 'voiced',
'consonant_place' : 'labiodental',
'consonant_manner': 'flap'
},
'ǃ': { # looks deceivingly like an exclamation mark, but it's a different unicode entry
'symbol_type' : 'phoneme',
'vowel_consonant' : 'consonant',
'VUV' : 'unvoiced',
'consonant_place' : 'postalveolar',
'consonant_manner': 'click'
},
} # REMEMBER to also add the phonemes added here to the ID lookup below as the new highest ID
def get_phone_to_id():
"""
for the states of the ctc loss and dijkstra/mas in the aligner
cannot be extracted trivially from above because sets are unordered and the IDs need to be consistent
"""
phone_to_id = dict()
for index, phone in enumerate("~#?!ǃ.ɜəaðɛɪŋɔɒɾʃθʊʌʒæbʔdefghijklmnɳopɡɹrstuvwxzʀøçɐœyʏɑcɲɣʎβʝɟqɕɭɵʑʋʁɨʂɓʙɗɖχʛʟɽɢɠǂɦǁĩʍʕɻʄũɤɶõʡʈʜɱɯǀɸʘʐɰɘħɞʉɴʢѵ"):
phone_to_id[phone] = index
return phone_to_id
def get_feature_to_index_lookup():
return {
# MODIFIER
# -- stress: modified by the previous symbol
"stressed" : 0,
# -- tone: modified by the following symbol
"very-high-tone" : 1,
"high-tone" : 2,
"mid-tone" : 3,
"low-tone" : 4,
"very-low-tone" : 5,
"rising-tone" : 6,
"falling-tone" : 7,
"peaking-tone" : 8,
"dipping-tone" : 9,
# -- lengthening: modified by the following symbol
"lengthened" : 10,
"half-length" : 11,
"shortened" : 12,
# CATEGORIES
"consonant" : 13,
"vowel" : 14,
"phoneme" : 15,
# NON-SPEECH-MARKERS
"silence" : 16,
"end of sentence" : 17,
"questionmark" : 18,
"exclamationmark" : 19,
"fullstop" : 20,
"word-boundary" : 21,
# PLACE
"dental" : 22,
"postalveolar" : 23,
"velar" : 24,
"palatal" : 25,
"glottal" : 26,
"uvular" : 27,
"labiodental" : 28,
"labial-velar" : 29,
"alveolar" : 30,
"bilabial" : 31,
"alveolopalatal" : 32,
"retroflex" : 33,
"pharyngal" : 34,
"epiglottal" : 35,
# TONGUE POSITION
"central" : 36,
"back" : 37,
"front_central" : 38,
"front" : 39,
"central_back" : 40,
# MOUTH OPENNESS
"mid" : 41,
"close-mid" : 42,
"close" : 43,
"open-mid" : 44,
"close_close-mid" : 45,
"open-mid_open" : 46,
"open" : 47,
# MOUTH SHAPE
"rounded" : 48,
"unrounded" : 49,
# MANNER
"plosive" : 50,
"nasal" : 51,
"approximant" : 52,
"trill" : 53,
"flap" : 54,
"fricative" : 55,
"lateral-approximant": 56,
"implosive" : 57,
"vibrant" : 58,
"click" : 59,
# TYPE
"unvoiced" : 60,
"voiced" : 61,
}
def generate_feature_table():
ipa_to_phonemefeats = generate_feature_lookup()
feat_types = set()
for ipa in ipa_to_phonemefeats:
if len(ipa) == 1:
[feat_types.add(feat) for feat in ipa_to_phonemefeats[ipa].keys()]
feat_to_val_set = dict()
for feat in feat_types:
feat_to_val_set[feat] = set()
for ipa in ipa_to_phonemefeats:
if len(ipa) == 1:
for feat in ipa_to_phonemefeats[ipa]:
feat_to_val_set[feat].add(ipa_to_phonemefeats[ipa][feat])
# print(feat_to_val_set)
value_list = set()
for val_set in [feat_to_val_set[feat] for feat in feat_to_val_set]:
for value in val_set:
value_list.add(value)
# print("{")
# for index, value in enumerate(list(value_list)):
# print('"{}":{},'.format(value,index))
# print("}")
value_to_index = get_feature_to_index_lookup()
phone_to_vector = dict()
for ipa in ipa_to_phonemefeats:
if len(ipa) == 1:
phone_to_vector[ipa] = [0] * (13 + sum([len(values) for values in [feat_to_val_set[feat] for feat in feat_to_val_set]]))
# there are 13 features which do not occur in the vectors, because they are context dependent and not lexical
for feat in ipa_to_phonemefeats[ipa]:
if ipa_to_phonemefeats[ipa][feat] in value_to_index:
phone_to_vector[ipa][value_to_index[ipa_to_phonemefeats[ipa][feat]]] = 1
if phone_to_vector[ipa][value_to_index["phoneme"]] != 1:
# it's not a phoneme, so we give it the silence marker, regardless of what it is.
phone_to_vector[ipa][value_to_index["silence"]] = 1
for feat in feat_to_val_set:
for value in feat_to_val_set[feat]:
if value not in value_to_index:
print(f"Unknown feature value in featureset! {value}")
# print(f"{sum([len(values) for values in [feat_to_val_set[feat] for feat in feat_to_val_set]])} should be 49")
return phone_to_vector
if __name__ == '__main__':
print(generate_feature_table())