import streamlit as st
import json
from urllib.request import urlopen
from thefuzz import fuzz
from keras.utils.data_utils import get_file
from keras_transformer import get_model, decode
####################################################################################################
# FUNCTIONS
def search_fit(word, data, threshold=50, fraction=2/3):
    # This function also works for n-word phrases: just remove the spaces
    # between the words before calling it.
    target = ''
    original = ''
    best_score = 0
    for item in data.keys():
        for i in range(len(data[item])):
            data_item = data[item][i].replace(' ', '')
            score = fuzz.ratio(word, data_item)
            # Keep the best match whose length is within `fraction` of the query's.
            if (score > best_score and score >= threshold
                    and fraction * len(word) <= len(data_item) <= len(word) / fraction):
                best_score = score
                target = item
                original = data_item
    return target, best_score, original
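# For example, with a toy dictionary (hypothetical data), search_fit returns the
# best-matching key along with its score and the matched entry:
#   search_fit('tlazohtla', {'amar': ['tlazohtla'], 'comer': ['cua']})
#   -> ('amar', 100, 'tlazohtla')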
def find_longest_phrase(data):
    # Length, in words, of the longest entry across all dictionary values.
    biggest_len = max(max(len(phrase.split()) for phrase in data[item]) for item in data.keys())
    return biggest_len
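# For example, with toy data:
#   find_longest_phrase({'comer': ['cua'], 'te amo': ['nimitz tlazohtla']}) -> 2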
def create_tuples(sample_list, tuple_size):
    # All windows of `tuple_size` consecutive indexes over `sample_list`.
    tuple_list = [tuple(i + j for j in range(tuple_size))
                  for i in range(len(sample_list) - tuple_size + 1)]
    return tuple_list
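# For example: create_tuples(['nimitz', 'tlazohtla', 'cenca'], 2) -> [(0, 1), (1, 2)]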
# NOTE: replace the combination function with something that generates cyclic permutations.
def make_translation(transcription, data, threshold=50, fraction=2/3):
    # To set limits for the comparison window size.
    data_len = find_longest_phrase(data)
    transcription_len = len(transcription.split())
    biggest_len = min(data_len, transcription_len)
    # To get the best translation for each window of the phrase.
    index_translation = list(range(transcription_len))
    translation_dict = {}
    translation = transcription
    transcription_split = transcription.split()
    # Iterate from the longest window down to single words, so that longer
    # phrases take priority over their sub-phrases.
    for i in range(biggest_len, 0, -1):
        # Match comparisons; windows are joined without spaces because
        # search_fit compares space-free strings.
        if i > 1:
            translation_dict.update({combination: search_fit(''.join(transcription_split[combination[0]:combination[-1]+1]), data, threshold, fraction)
                                     for combination in create_tuples(transcription_split, i)})
        else:
            translation_dict.update({combination: search_fit(transcription_split[combination[0]], data, threshold, fraction)
                                     for combination in create_tuples(transcription_split, i)})
        # Take the best translation, prioritizing the longest phrases.
        # TODO: the search could be improved by prioritizing the highest score
        # instead of going in sequential order.
        for combination in create_tuples(transcription_split, i):
            # 1 if every index in the window is still free (-1 marks taken indexes).
            clear_index = min(1 * (item in index_translation) for item in combination)
            if clear_index and translation_dict[combination][1] > threshold:
                taken = False
                translation_split = translation.split()
                for number, word in enumerate(translation_split):
                    if number in combination:
                        if not taken:
                            # Hyphenate multi-word targets so they keep
                            # occupying a single slot until the final cleanup.
                            if len(translation_dict[combination][0].split()) > 1:
                                translation_split[number] = '-'.join(translation_dict[combination][0].split())
                            else:
                                translation_split[number] = translation_dict[combination][0]
                            taken = True
                        else:
                            translation_split[number] = '<>'
                translation = ' '.join(translation_split)
                # Mark consumed indexes with -1 (0 is itself a valid word index).
                index_translation = [item if item not in combination else -1 for item in index_translation]
    # Unhyphenate targets, drop placeholders, and collapse repeated spaces.
    return ' '.join(translation.replace('-', ' ').replace('<>', '').split())
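# For example, with a toy dictionary (hypothetical data) the whole input is
# matched as a single phrase and replaced by its dictionary key:
#   make_translation('nimitztlazohtla', {'te amo': ['nimitztlazohtla']})
#   -> 'te amo'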
def remover(my_string=""):
    # Strip every character that is not in the global `values` whitelist
    # (defined below, just before the UI).
    for item in my_string:
        if item not in values:
            my_string = my_string.replace(item, "")
    return my_string
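# For example: remover('¡niltze!') -> 'niltze', since `values` (defined below)
# only keeps ASCII letters, digits, and spaces.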
def translate(oracion, model):
    # The phrase could optionally be pre-translated with the dictionary first, e.g.:
    # sentence = make_translation(oracion.strip().lower(), dictionary, threshold=90, fraction=4/5)
    sentence = oracion[:]
    # Tokenize and append the sequence-end markers.
    tokens = sentence.split(' ') + ['<END>', '<PAD>']
    # Map tokens to ids, falling back to <UNK> for out-of-vocabulary tokens.
    tr_input = [source_token_dict.get(token, source_token_dict['<UNK>']) for token in tokens]
    decoded = decode(
        model,
        tr_input,
        start_token=target_token_dict['<START>'],
        end_token=target_token_dict['<END>'],
        pad_token=target_token_dict['<PAD>'],
    )
    # Drop the <START>/<END> markers and map ids back to tokens.
    return ' '.join(target_token_dict_inv[x] for x in decoded[1:-1])
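# For example, once the dictionaries, model, and weights below are loaded:
#   translate('niltze', model)  # returns the model's Spanish decoding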
####################################################################################################
# MAIN APP
# Token dictionaries for the transformer (shared source/target vocabulary).
path_dict = 'https://huggingface.co./spaces/gilesitorr/Nahuatl2Spanish/raw/main/Dictionaries/'
response = urlopen(path_dict+'uncased_tokens_pretrained.json')
source_token_dict = json.loads(response.read())
target_token_dict = source_token_dict.copy()
response = urlopen(path_dict+'uncased_tokens_inv_pretrained.json')
target_token_dict_inv = json.loads(response.read())
# JSON keys are strings; cast them back to integer token ids.
target_token_dict_inv = {int(k): v for k, v in target_token_dict_inv.items()}
# Náhuatl-Spanish dictionary for the phrase-based pre-translation.
response = urlopen(path_dict+'nah_es.json')
dictionary = json.loads(response.read())
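# Judging by how make_translation uses it, nah_es.json maps Spanish phrases to
# lists of Náhuatl variants, e.g. {'te amo': ['nimitztlazohtla', ...], ...}
# (illustrative structure, not an actual entry).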
# Build the transformer; use_same_embed=False gives separate source/target embeddings.
model = get_model(
    token_num=max(len(source_token_dict), len(target_token_dict)),
    embed_dim=256,
    encoder_num=2,
    decoder_num=2,
    head_num=32,
    hidden_dim=2048,
    dropout_rate=0.1,
    use_same_embed=False,
)
# Download the pretrained weights and load them into the model.
path_model = 'https://huggingface.co./spaces/gilesitorr/Nahuatl2Spanish/resolve/main/Models/'
filename = path_model+'uncased_translator_nahuatl2espanol+hybrid.h5'
weights_path = get_file('.././model.h5', filename)
model.load_weights(weights_path)
# Whitelist of characters kept by remover(): ASCII letters, digits, and spaces.
values = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 ")
text = st.text_area('Escriba una frase a traducir: ')  # "Write a phrase to translate:"
if text:
    out = translate(remover(text.lower()), model)
    st.text(out)