Spaces:

patharanor
/

asr-th

Runtime error

File size: 3,095 Bytes

79be08a
2f990e6
79be08a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2f990e6
 
79be08a
 
2f990e6
 
79be08a
 
 
1b3124c
 
79be08a
2f990e6
79be08a
1b3124c
79be08a
 
2f990e6
79be08a
1b3124c
 
79be08a
 
 
 
2f990e6
79be08a
 
 
2f990e6
79be08a
 
 
 
 
2f990e6
79be08a
1b3124c
2f990e6
79be08a
2f990e6
79be08a

from pythainlp.util import text_to_num, text_to_arabic_digit
from collections import deque

class ThaiWord:
    """Rewrite spelled-out Thai numbers in a token stream as Arabic digits.

    The entry point is :meth:`pretty`, which walks a deque of word tokens,
    groups consecutive number words together and replaces each group with
    the value produced by :meth:`words_to_number`.
    """

    def __init__(self) -> None:
        # Words for the digits one to nine.
        self.word_number = ['หนึ่ง','สอง','สาม','สี่','ห้า','หก','เจ็ด','แปด','เก้า']
        # Place-value words: ten, hundred, thousand, ten-thousand,
        # hundred-thousand, million.
        self.word_digit = ['สิบ','ร้อย','พัน','หมื่น','แสน','ล้าน']
        # Irregular forms: "one" in the units position (as in eleven) and
        # "two" in the tens position (as in twenty).
        self.word_number_specific = ['เอ็ด', 'ยี่']
        # Not read by any method of this class; kept for external users.
        self.word_digit_specific = ['สิบ']

    def iscontains11(self, word) -> bool:
        """Return True if *word* starts or ends with the irregular "one"."""
        irregular_one = self.word_number_specific[0]
        # startswith/endswith avoid hard-coding the literal's length.
        return word.endswith(irregular_one) or word.startswith(irregular_one)

    def iscontains2x(self, word) -> bool:
        """Return True if *word* starts with the irregular "two" (twenty-x)."""
        return word.startswith(self.word_number_specific[1])

    def words_to_number(self, words) -> str:
        """Convert a run of Thai number words to a display string.

        Args:
            words: sequence of number-word tokens collected by ``pretty``.

        Returns:
            * the word itself when ``words`` is a single place-value word
              (e.g. a lone "thousand" is left as text);
            * ``' 1,234 '`` (comma-grouped, padded with spaces) when
              ``text_to_num`` yields an integer string;
            * ``' 123'`` (leading space only) built digit-by-digit with
              ``text_to_arabic_digit`` when the conversion above fails;
            * the joined input text when ``text_to_num`` yields nothing.
        """
        if len(words) == 1 and words[0] in self.word_digit:
            # A lone unit word is kept as text. Return the string, not the
            # list, so callers never interpolate a list repr.
            return words[0]

        try:
            joined = "".join(words)
            # NOTE(review): text_to_num is assumed to return a list of
            # strings whose first item is the numeric part -- confirm
            # against the pythainlp documentation.
            converted = text_to_num(joined)
            if len(converted) > 0:
                return f' {int(converted[0]):,} '
        except Exception:
            # Deliberate best-effort: any conversion failure (including a
            # non-integer result) falls back to mapping each word to a
            # digit. Built from scratch so no partial result leaks in.
            digits = "".join(text_to_arabic_digit(word) for word in words)
            return f' {digits}'

        # Nothing was converted: keep the original text unchanged.
        return joined

    def pretty(self, tokens: deque) -> str:
        """Join *tokens* into text, replacing number-word runs with digits.

        Args:
            tokens: deque of word tokens; it is consumed (emptied) here.

        Returns:
            The concatenated text with every run of number words replaced
            by the result of :meth:`words_to_number`.
        """
        parts = []
        pending = []  # number words of the run currently being collected

        while tokens:
            word = tokens.popleft()

            if pending:
                if self.is_number(word) or self.is_digit(word):
                    pending.append(word)
                    continue
                # The run ended: flush it, then handle `word` below.
                parts.append(self.words_to_number(pending))
                pending = []

            # Detect the first number word of a new run.
            if self.is_start_number(word):
                pending.append(word)
            else:
                parts.append(word)

        # Flush a run that reaches the end of the input.
        if pending:
            parts.append(self.words_to_number(pending))

        return "".join(parts)

    def is_start_number(self, word) -> bool:
        """Return True if *word* can begin a run of number words."""
        return (
            word in self.word_number
            or word in self.word_digit
            or self.iscontains2x(word)
            or self.iscontains11(word)
        )

    def is_digit(self, word) -> bool:
        """Return True if *word* is a place-value word (ten, hundred, ...)."""
        return word in self.word_digit

    def is_number(self, word) -> bool:
        """Return True if *word* is a digit word or an irregular form."""
        return (
            word in self.word_number
            or word in self.word_number_specific
            or self.iscontains11(word)
        )