gilesitorr commited on
Commit
19d0c71
·
1 Parent(s): 1029926

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +154 -0
app.py ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# BUG FIX: the original imported `stramlit`, a typo that raises
# ModuleNotFoundError before the app can start; the package is `streamlit`.
import json
from itertools import combinations

import streamlit as st
from keras_transformer import get_model, decode
from thefuzz import fuzz

7
+ ####################################################################################################
8
+ # FUNCTIONS
9
def search_fit(word, data, threshold=50, fraction=2/3):
    """Fuzzy-match `word` against every phrase stored in `data`.

    `data` maps a target term to a list of source phrases. Each phrase is
    compared with its spaces stripped, so this also works for n-word inputs
    (just remove the spaces between words before calling).

    Returns (best_key, best_score, best_matching_phrase); empty strings and 0
    when nothing clears the threshold and length bounds.
    """
    best_key = ''
    best_match = ''
    best_score = 0
    # Candidate length must stay within a factor of `fraction` of the query.
    min_len = fraction * len(word)
    max_len = len(word) / fraction

    for key, phrases in data.items():
        for phrase in phrases:
            candidate = phrase.replace(' ', '')
            score = fuzz.ratio(word, candidate)
            if score <= best_score or score < threshold:
                continue
            if min_len <= len(candidate) <= max_len:
                best_score = score
                best_key = key
                best_match = candidate

    return best_key, best_score, best_match
26
+
27
def find_longest_phrase(data):
    """Return the word count of the longest phrase among all of data's value lists."""
    longest_per_key = [
        max(len(phrase.split()) for phrase in data[key])
        for key in data
    ]
    return max(longest_per_key)
30
+
31
def create_tuples(sample_list, tuple_size):
    """Return every window of `tuple_size` consecutive indices into sample_list.

    E.g. a 4-element list with tuple_size=2 yields [(0, 1), (1, 2), (2, 3)].
    Lists shorter than tuple_size yield an empty list.
    """
    window_count = len(sample_list) - tuple_size + 1
    return [tuple(range(start, start + tuple_size)) for start in range(window_count)]
36
+
37
# TODO (from original author): replace the combination generator with cyclic
# permutations for better phrase coverage.
def make_translation(transcription, data, threshold=50, fraction=2/3):
    """Greedy phrase-level translation of `transcription` using dictionary `data`.

    Longest dictionary phrases are tried first; each source word is consumed by
    at most one match. A multi-word translation is temporarily hyphen-joined so
    it occupies a single slot, and the other consumed slots become '<>'
    placeholders; both markers are undone before returning.

    Returns the translated string with unmatched words left as-is.
    """
    # Never compare windows longer than the dictionary's longest entry or the input.
    data_len = find_longest_phrase(data)
    transcription_split = transcription.split()
    transcription_len = len(transcription_split)
    biggest_len = min(data_len, transcription_len)

    # index_translation[i] == i while word i is still free; -1 once consumed.
    # (BUG FIX: the original marked used slots with 0, which collides with the
    # legitimate word index 0 — word 0 could never get a single-word match.)
    index_translation = list(range(transcription_len))

    translation_dict = {}
    translation = transcription

    # BUG FIX: the original looped `range(1, 0, -1)`, so only single words were
    # ever matched, the multi-word branch was dead code and `biggest_len` unused.
    for size in range(biggest_len, 0, -1):
        # Score every window of `size` consecutive words against the dictionary.
        if size > 1:
            translation_dict.update({
                combination: search_fit(
                    ''.join(transcription_split[combination[0]:combination[-1] + 1]),
                    data, threshold, fraction)
                for combination in create_tuples(transcription_split, size)})
        else:
            translation_dict.update({
                combination: search_fit(transcription_split[combination[0]],
                                        data, threshold, fraction)
                for combination in create_tuples(transcription_split, size)})

        # Commit matches greedily, longest windows first.
        # (Could be improved by ordering on score instead of sequential order.)
        for combination in create_tuples(transcription_split, size):
            if size > 1:
                # Usable only if every covered word is still free.
                usable = all(item in index_translation for item in combination)
            else:
                usable = index_translation[combination[0]] != -1
            if not usable or translation_dict[combination][1] <= threshold:
                continue

            taken = False
            translation_split = translation.split()
            for number, _word in enumerate(translation_split):
                if number in combination:
                    if not taken:
                        matched = translation_dict[combination][0]
                        if len(matched.split()) > 1:
                            # BUG FIX: the original joined the *characters* of the
                            # string; the intent (see final replace('-', ' ')) is
                            # to hyphen-join the words into one slot.
                            translation_split[number] = '-'.join(matched.split())
                        else:
                            translation_split[number] = matched
                        taken = True
                    else:
                        translation_split[number] = '<>'
            translation = ' '.join(translation_split)

            index_translation = [item if item not in combination else -1
                                 for item in index_translation]

    # Undo the hyphen joiners, drop placeholders, and squeeze the double spaces
    # left behind. (The original's replace(' ', ' ') was a no-op — almost
    # certainly a double space collapsed by HTML rendering.)
    return translation.replace('-', ' ').replace('<>', '').replace('  ', ' ').replace('  ', ' ').strip()
98
+
99
+
100
def remover(my_string="", allowed=None):
    """Return `my_string` with every character outside the whitelist removed.

    allowed: iterable of permitted characters; defaults to the module-level
    `values` whitelist (ASCII letters, digits and space) when None.

    IMPROVED: the original called str.replace once per offending character
    (quadratic on long inputs); a single join pass is linear. The whitelist is
    now also an optional parameter, which is backward-compatible.
    """
    keep = values if allowed is None else allowed
    return ''.join(ch for ch in my_string if ch in keep)
105
+
106
def translate(oracion, model):
    """Translate one sentence with the transformer `model`.

    The input is first normalized through the fuzzy dictionary lookup
    (make_translation), tokenized on spaces with '<END>'/'<PAD>' appended,
    mapped to source-vocabulary ids (unknown words -> '<UNK>'), decoded, and
    the result detokenized via the inverse target vocabulary.
    """
    normalized = make_translation(oracion.strip().lower(), dictionary,
                                  threshold=90, fraction=4/5)
    tokens = normalized.split(' ') + ['<END>', '<PAD>']

    unk_id = source_token_dict['<UNK>']
    encoder_input = [source_token_dict.get(token, unk_id) for token in tokens]

    decoded = decode(
        model,
        encoder_input,
        start_token=target_token_dict['<START>'],
        end_token=target_token_dict['<END>'],
        pad_token=target_token_dict['<PAD>'],
    )

    # Strip the leading <START> and trailing <END> ids before detokenizing.
    return ' '.join(target_token_dict_inv[idx] for idx in decoded[1:-1])
119
+
120
####################################################################################################
# MAIN APP
from urllib.request import urlopen

path_dict = 'https://huggingface.co/spaces/gilesitorr/Nahuatl2Spanish/raw/main/Dictionaries/'


def _load_json_url(url):
    # BUG FIX: the original used open() on HTTPS URLs (and one call even passed
    # the invalid mode 'b'); plain open() cannot read remote files.
    with urlopen(url) as response:
        return json.load(response)


source_token_dict = _load_json_url(path_dict + 'uncased_tokens_pretrained.json')

target_token_dict_inv = _load_json_url(path_dict + 'uncased_tokens_inv_pretrained.json')
target_token_dict_inv = {int(k): v for k, v in target_token_dict_inv.items()}

# BUG FIX: target_token_dict was referenced below but never loaded; rebuild the
# forward vocabulary (token -> id) from the inverse map (id -> token).
target_token_dict = {v: k for k, v in target_token_dict_inv.items()}

dictionary = _load_json_url(path_dict + 'nah_es.json')

model = get_model(
    token_num=max(len(source_token_dict), len(target_token_dict)),
    embed_dim=256,
    encoder_num=2,
    decoder_num=2,
    head_num=32,
    hidden_dim=2048,
    dropout_rate=0.1,
    use_same_embed=False,
)

path_model = 'https://huggingface.co/spaces/gilesitorr/Nahuatl2Spanish/raw/main/Models/'
filename = path_model + 'uncased_translator_nahuatl2espanol+hybrid.h5'
# NOTE(review): load_weights normally expects a local file path; an HTTP URL may
# need to be downloaded first (e.g. keras.utils.get_file) — confirm on the host.
model.load_weights(filename)

# Whitelist used by remover(): ASCII letters, digits and space.
values = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 ")

text = st.text_area('Escriba una frase a traducir: ')
if text:
    # BUG FIX: the original called the undefined name `traducir` and omitted
    # the required `model` argument; the function defined above is `translate`.
    out = translate(remover(text.lower()), model)
    st.text(out)