Spaces:

emircanerol
/

diacritizeTR

Sleeping

diacritizeTR / app.py

emircanerol

Add application file

a17b609 6 months ago

2.18 kB

	import gradio as gr
	import torch
	from peft import PeftModel, PeftConfig
	from transformers import AutoModelForTokenClassification

	def test_mask(model, sample):
	"""
	Masks the padded tokens in the input.
	Args:
	data (list): List of strings.
	Returns:
	dataset (list): List of dictionaries.
	"""

	tokens = dict()

	input_tokens = [i + 3 for i in sample.encode('utf-8')]
	input_tokens.append(0) # eos token
	tokens['input_ids'] = torch.tensor([input_tokens], dtype=torch.int64, device=model.device)

	# Create attention mask
	tokens['attention_mask'] = torch.ones_like(tokens['input_ids'], dtype=torch.int64, device=model.device)

	return tokens

	# Maps an ASCII byte value to the UTF-8 bytes of the Turkish letter that replaces it
	# when the model flags the position, e.g. ord('u') -> b'\xc3\xbc' ("ü").
	# Built once at import time instead of on every call.
	_EN2TR = {
	    ord(ascii_char): turkish_char.encode('utf-8')
	    for ascii_char, turkish_char in zip('UIGSCOuigsco', 'ÜİĞŞÇÖüığşçö')
	}


	def rewrite(model, data):
	    """
	    Rewrites the input text with the model.

	    The model is called with ``**data`` and the highest-scoring label is taken for
	    every position. Wherever that label is non-zero and the input byte is one of
	    ``UIGSCOuigsco``, the byte is replaced by the matching Turkish letter from
	    ``ÜİĞŞÇÖüığşçö``; every other byte is copied through unchanged.

	    Args:
	        model: Callable accepting ``input_ids`` and ``attention_mask`` keyword
	            arguments and returning an object with a ``logits`` tensor.
	            Assumes logits are shaped (1, sequence_length, num_labels) -- TODO confirm.
	        data (dict): Dictionary containing 'input_ids' and 'attention_mask' for a
	            batch of one sequence.
	    Returns:
	        output (str): Rewritten text.
	    """
	    with torch.no_grad():
	        # Best label per position; squeeze(0) drops the batch dimension of size one.
	        pred = torch.argmax(model(**data).logits, dim=2).squeeze(0)

	    # Undo the +3 id offset to recover raw byte values; ids 0..2 turn negative.
	    byte_values = (data['input_ids'].squeeze(0) - 3).tolist()

	    output = bytearray()
	    for inp, lab in zip(byte_values, pred.tolist()):
	        replacement = _EN2TR.get(inp) if lab else None
	        if replacement is not None:
	            # The model predicts a diacritic: emit the Turkish character's bytes.
	            output.extend(replacement)
	        elif inp >= 0:
	            # Negative values come from ids below the byte offset and are dropped.
	            output.append(inp)
	    return output.decode()

	def try_it(text):
	    """
	    Run the full pipeline on one piece of text.

	    Encodes ``text`` with test_mask and passes the result to rewrite, using the
	    module-level ``model`` for both steps.

	    Args:
	        text (str): Text to process.
	    Returns:
	        The value returned by rewrite for the encoded text.
	    """
	    return rewrite(model, test_mask(model, text))


	if __name__ == '__main__':
	    # Hub id of the PEFT adapter applied on top of the base model.
	    adapter_id = "bite-the-byte/byt5-small-deASCIIfy-TR"

	    # `model` is deliberately bound at module level: try_it reads it as a global.
	    # The separate PeftConfig fetch was dropped because its result was never used;
	    # PeftModel.from_pretrained resolves the adapter configuration on its own.
	    model = AutoModelForTokenClassification.from_pretrained("google/byt5-small")
	    model = PeftModel.from_pretrained(model, adapter_id)

	    # Plain text-in / text-out web UI around try_it.
	    diacritize_app = gr.Interface(fn=try_it, inputs="text", outputs="text")
	    diacritize_app.launch(share=True)