File size: 3,095 Bytes
79be08a
2f990e6
79be08a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2f990e6
 
79be08a
 
2f990e6
 
79be08a
 
 
1b3124c
 
79be08a
2f990e6
79be08a
1b3124c
79be08a
 
2f990e6
79be08a
1b3124c
 
79be08a
 
 
 
2f990e6
79be08a
 
 
2f990e6
79be08a
 
 
 
 
2f990e6
79be08a
1b3124c
2f990e6
79be08a
2f990e6
79be08a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
from pythainlp.util import text_to_num, text_to_arabic_digit
from collections import deque

class ThaiWord:
    """Rewrite spelled-out Thai numbers inside a token stream as Arabic numerals.

    The class keeps small vocabularies of Thai number words and uses them to
    find runs of number tokens; each run is converted with
    ``pythainlp.util.text_to_num`` (falling back to
    ``pythainlp.util.text_to_arabic_digit``).
    """

    def __init__(self) -> None:
        # Words for the values one through nine.
        self.word_number = ['หนึ่ง','สอง','สาม','สี่','ห้า','หก','เจ็ด','แปด','เก้า']
        # Unit (place-value) words: ten, hundred, thousand, ... million.
        self.word_digit = ['สิบ','ร้อย','พัน','หมื่น','แสน','ล้าน']
        # Special forms: index 0 is matched as a prefix/suffix by
        # ``iscontains11``, index 1 as a prefix by ``iscontains2x``.
        self.word_number_specific = ['เอ็ด', 'ยี่']
        self.word_digit_specific = ['สิบ']

    def iscontains11(self, word: str) -> bool:
        """Return True if *word* starts or ends with ``word_number_specific[0]``."""
        marker = self.word_number_specific[0]
        # startswith/endswith avoid hard-coding the marker's length in slices.
        return word.endswith(marker) or word.startswith(marker)

    def iscontains2x(self, word: str) -> bool:
        """Return True if *word* starts with ``word_number_specific[1]``."""
        return word.startswith(self.word_number_specific[1])

    def words_to_number(self, words) -> str:
        """Convert a run of Thai number tokens into display text.

        Args:
            words: Sequence of token strings forming one number run.

        Returns:
            * The token itself when the run is a single unit word
              (e.g. a lone "million" stays as text).
            * ``' {n:,} '`` (space padded, thousands separated) when
              ``text_to_num`` yields something ``int()`` can parse.
            * Otherwise ``' '`` followed by the per-token
              ``text_to_arabic_digit`` conversions.
        """
        # A lone unit word is kept as text. Return the string, not the
        # list, so callers that interpolate the result do not get "['...']".
        if len(words) == 1 and words[0] in self.word_digit:
            return words[0]

        prefix = ''
        try:
            converted = text_to_num("".join(words))
            # Only the first element is used; it is expected to be the number.
            if converted:
                prefix = converted[0]
                return f' {int(prefix):,} '
        except Exception:
            # Deliberate best-effort: any conversion failure falls through
            # to the digit-by-digit path below.
            pass

        # NOTE(review): when int() rejects the first text_to_num element,
        # that element is kept in front of the digit conversions, exactly
        # as before - confirm this carry-over is intended.
        digits = "".join(text_to_arabic_digit(word) for word in words)
        return f' {prefix}{digits}'

    def pretty(self, tokens: deque) -> str:
        """Join *tokens* into text, replacing number runs with numerals.

        The deque is consumed from the left and is empty on return.

        Args:
            tokens: Deque of token strings.

        Returns:
            The concatenated text with every detected number run replaced
            by the result of ``words_to_number``.
        """
        pieces = []
        pending = []  # tokens of the number run currently being collected

        while tokens:
            word = tokens.popleft()

            if pending:
                if self.is_number(word) or self.is_digit(word):
                    pending.append(word)
                    continue
                # The run ended: emit it, then treat `word` like any other
                # token (it may itself start a new run).
                pieces.append(self.words_to_number(pending))
                pending = []

            if self.is_start_number(word):
                pending.append(word)
            else:
                pieces.append(word)

        # Flush a run that reaches the end of the input.
        if pending:
            pieces.append(self.words_to_number(pending))

        return "".join(pieces)

    def is_start_number(self, word: str) -> bool:
        """Return True if *word* can open a number run."""
        return (
            word in self.word_number
            or word in self.word_digit
            or self.iscontains2x(word)
            or self.iscontains11(word)
        )

    def is_digit(self, word: str) -> bool:
        """Return True if *word* is one of the unit words in ``word_digit``."""
        return word in self.word_digit

    def is_number(self, word: str) -> bool:
        """Return True if *word* can continue a number run as a value word."""
        return (
            word in self.word_number
            or word in self.word_number_specific
            or self.iscontains11(word)
        )