import streamlit as st
import json
from urllib.request import urlopen
from thefuzz import fuzz
from keras.utils.data_utils import get_file
from keras_transformer import get_model, decode

####################################################################################################
# FUNCTIONS
def search_fit(word, data, threshold=50, fraction=2/3):
  # This also works for n-word phrases: just remove the spaces between
  # the words before calling it.
  target = ''
  original = ''
  best_score = 0

  for item in data.keys():
    for i in range(len(data[item])):
      data_item = data[item][i].replace(' ', '')
      score = fuzz.ratio(word, data_item)
      # Keep the best-scoring candidate that clears the threshold and whose
      # length is within a factor of `fraction` of the query length.
      if (score > best_score and score >= threshold
          and fraction*len(word) <= len(data_item) <= len(word)/fraction):
        best_score = score
        target = item
        original = data_item

  return target, best_score, original
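
# Usage sketch (hypothetical data; the real lexicon is loaded below as
# `dictionary`). Scores come from thefuzz and are illustrative:
#   lexicon = {'casa': ['calli'], 'agua': ['atl']}
#   search_fit('kalli', lexicon)  # -> ('casa', 80, 'calli')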

def find_longest_phrase(data):
  # Length (in words) of the longest entry across all dictionary values.
  biggest_len = max(len(phrase.split()) for phrases in data.values() for phrase in phrases)
  return biggest_len
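
# Example: find_longest_phrase({'a': ['uno dos', 'tres'], 'b': ['cuatro']})
# returns 2, the word count of the longest phrase ('uno dos').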

def create_tuples(sample_list, tuple_size):
  # Index windows of `tuple_size` consecutive positions in `sample_list`.
  tuple_list = [tuple(i + j for j in range(tuple_size))
                for i in range(len(sample_list) - tuple_size + 1)]
  return tuple_list
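
# Example: create_tuples(['ni', 'mitz', 'tlazohtla'], 2) -> [(0, 1), (1, 2)],
# i.e. the index windows of every run of 2 consecutive words.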

# NOTE: consider replacing the window generation with cyclic permutations.
def make_translation(transcription, data, threshold=50, fraction=2/3):

  # To set limits for the comparison window size
  data_len = find_longest_phrase(data)
  transcription_len = len(transcription.split())
  biggest_len = min(data_len, transcription_len)

  # Indexes of words still available for matching (None once consumed)
  index_translation = list(range(transcription_len))

  translation_dict = {}
  translation = transcription
  transcription_split = transcription.split()

  # Try the longest windows first so multi-word phrases take priority
  for i in range(biggest_len, 0, -1):
    # Match comparisons (spaces are removed for multi-word lookups, as
    # search_fit expects)
    if i > 1:
      translation_dict.update({combination: search_fit(''.join(transcription_split[combination[0]:combination[-1]+1]), data, threshold, fraction)
                               for combination in create_tuples(transcription_split, i)})
    else:
      translation_dict.update({combination: search_fit(transcription_split[combination[0]], data, threshold, fraction)
                               for combination in create_tuples(transcription_split, i)})

    # Take the best translation, prioritizing the longest phrases
    # TODO: the search could be improved by ranking candidates by score
    # instead of taking them in sequential order.
    for combination in create_tuples(transcription_split, i):
      clear_index = all(index_translation[item] is not None for item in combination)  # True if all indexes are free
      if clear_index and translation_dict[combination][1] > threshold:
        taken = False
        translation_split = translation.split()
        for number, word in enumerate(translation_split):
          if number in combination:
            if not taken:
              # Hyphenate multi-word targets so they stay a single token
              # through the split/join round-trips; '-' becomes ' ' at the end.
              translation_split[number] = '-'.join(translation_dict[combination][0].split())
              taken = True
            else:
              translation_split[number] = '<>'  # placeholder, stripped later
        translation = ' '.join(translation_split)
        index_translation = [item if item not in combination else None for item in index_translation]

  return translation.replace('-', ' ').replace('<>', '').replace('  ', ' ').replace('  ', ' ').strip()
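
# Illustrative call (hypothetical data): with data = {'agua': ['atl']},
# make_translation('atl calli', data) matches the first word and leaves the
# unmatched one untouched, returning 'agua calli'.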


def remover(my_string=""):
  # Strip every character not in the global `values` whitelist defined below.
  for item in my_string:
    if item not in values:
      my_string = my_string.replace(item, "")
  return my_string
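
# Example: remover('¡tlazohtla!') -> 'tlazohtla', since '¡' and '!' are not in
# the `values` whitelist defined in the main app below.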
  
def translate(oracion, model):
  # First apply the dictionary-based phrase substitution, then run the
  # transformer on the partially translated sentence.
  sentence = make_translation(oracion.strip().lower(), dictionary, threshold=90, fraction=4/5)
  sentence_tokens = sentence.split(' ') + ['<END>', '<PAD>']
  # Map tokens to ids, falling back to <UNK> for out-of-vocabulary tokens
  tr_input = [source_token_dict.get(x, source_token_dict['<UNK>']) for x in sentence_tokens]
  decoded = decode(
      model,
      tr_input,
      start_token = target_token_dict['<START>'],
      end_token = target_token_dict['<END>'],
      pad_token = target_token_dict['<PAD>']
  )

  # Drop the <START> and <END> markers before mapping ids back to tokens
  return ' '.join(target_token_dict_inv[x] for x in decoded[1:-1])
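
# Hedged usage sketch (requires the dictionaries and model loaded below; the
# exact output depends on the trained weights):
#   translate('nimitztlazohtla', model)  # -> e.g. 'te amo'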
  
####################################################################################################
# MAIN APP
path_dict = 'https://huggingface.co./spaces/gilesitorr/Nahuatl2Spanish/raw/main/Dictionaries/'

# Shared source/target vocabulary (token -> id)
response = urlopen(path_dict+'uncased_tokens_pretrained.json')
source_token_dict = json.loads(response.read())
target_token_dict = source_token_dict.copy()

# Inverse vocabulary (id -> token); JSON keys arrive as strings, so cast to int
response = urlopen(path_dict+'uncased_tokens_inv_pretrained.json')
target_token_dict_inv = json.loads(response.read())
target_token_dict_inv = {int(k): v for k, v in target_token_dict_inv.items()}

# Nahuatl -> Spanish phrase dictionary used by make_translation
response = urlopen(path_dict+'nah_es.json')
dictionary = json.loads(response.read())

model = get_model(
      token_num = max(len(source_token_dict),len(target_token_dict)),
      embed_dim = 256,
      encoder_num = 2,
      decoder_num = 2,
      head_num = 32,
      hidden_dim = 2048,
      dropout_rate = 0.1,
      use_same_embed = False,
)
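
# NOTE: these hyperparameters must match the ones used when the checkpoint
# below was trained; otherwise load_weights will fail on shape mismatches.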

path_model = 'https://huggingface.co./spaces/gilesitorr/Nahuatl2Spanish/raw/main/Models/'
filename = path_model+'uncased_translator_nahuatl2espanol+hybrid.h5'
# Download the weights file to the local Keras cache, then load it from the
# local path (load_weights cannot read directly from a URL).
weights_path = get_file('model', filename)
model.load_weights(weights_path)

# Character whitelist used by remover()
values = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 ")
text = st.text_area('Escriba una frase a traducir: ')
if text:
  out = translate(remover(text.lower()), model)
  st.text(out)