# Author: Amir Hossein Kargaran
# Date: August, 2023
# Description: This code applies MaskLID (code-switch language identification) with GlotLID, a fastText-based language identification tool.
# MIT License

import functools

import gradio as gr
from masklid import MaskLID
from huggingface_hub import hf_hub_download
from fasttext.FastText import _FastText


def render_metadata():
    """Return the description text shown below the demo title."""
    html_content = """
This is the demo for MaskLID paper (ACL 2024). You can see the whole code in our GitHub. Please also note that if you increase the number of languages, you also need larger alpha and beta values. MaskLID does not add much overhead to language identification. You first fix the languages your model is limited to and then run the MaskLID code. However, in this demo, we load the model each time (that takes couple of seconds) you hit submit to ensure the results are not cached and to make it possible to change the set of languages each time. We may later change the demo code to resolve this.
"""
    return html_content


@functools.lru_cache(maxsize=None)
def get_model_path():
    """Return the local path of the GlotLID fastText model file.

    The file is fetched from the Hugging Face Hub (repo ``cis-lmu/glotlid``,
    file ``model_v3.bin``). The returned path is memoized for the lifetime of
    the process so that the Hub is not queried again on every request; the
    model itself is still loaded from that path each time it is needed.
    """
    return hf_hub_download(repo_id="cis-lmu/glotlid", filename="model_v3.bin")


def get_masklid():
    """Load a fresh MaskLID model and list the labels it may be limited to.

    Returns:
        A ``(masklid_model, labels)`` tuple. ``labels`` is the model's label
        list, in the order the model reports it, without the entries that
        start with ``__label__und`` or ``__label__zxx``.
    """
    masklid_model = MaskLID(get_model_path())

    excluded_prefixes = ("__label__und", "__label__zxx")
    labels = [
        label
        for label in masklid_model.model.get_labels()
        if not label.startswith(excluded_prefixes)
    ]
    return masklid_model, labels


def predict_codeswitch(
    text,
    top_labels=200,
    beta=20,
    alpha=3,
    max_lambda=3,
    min_length=10,
    min_prob=0.90,
    max_retry=3,
    alpha_step_increase=3,
    beta_step_increase=5,
):
    """Run MaskLID code-switch detection on ``text``.

    A new model is loaded on every call, restricted to the first
    ``top_labels`` labels returned by ``get_masklid()``, and then queried.

    Args:
        text: The input text to analyse.
        top_labels: How many labels (from the start of the filtered label
            list) the model is limited to.
        beta: Clamped so that it never exceeds ``top_labels``.
        alpha: Clamped so that it never exceeds ``beta``.
        max_lambda, min_length, min_prob, max_retry, alpha_step_increase,
        beta_step_increase: Forwarded unchanged to
            ``MaskLID.predict_codeswitch``; see that method for their exact
            meaning.

    Returns:
        Whatever ``MaskLID.predict_codeswitch`` returns for the given input.
    """
    # Gradio documents slider values as floats, and a float cannot be used as
    # a slice bound below, so normalise the label count to an int first.
    # NOTE(review): beta/alpha are passed through as received - confirm
    # whether MaskLID also needs them to be ints.
    top_labels = int(top_labels)

    # constraints: alpha <= beta <= top_labels
    beta = min(beta, top_labels)
    alpha = min(alpha, beta)

    # override the masklid label set
    masklid_model, labels = get_masklid()
    masklid_model.language_indices = masklid_model._compute_language_indices(
        labels[:top_labels]
    )
    # Fetch the full label list once instead of once per selected index.
    all_labels = masklid_model.model.get_labels()
    masklid_model.labels = [all_labels[i] for i in masklid_model.language_indices]

    ans = masklid_model.predict_codeswitch(
        text,
        beta=beta,
        alpha=alpha,
        max_lambda=max_lambda,
        min_length=min_length,
        min_prob=min_prob,
        max_retry=max_retry,
        alpha_step_increase=alpha_step_increase,
        beta_step_increase=beta_step_increase,
    )
    return ans


inputs = gr.Textbox(
    lines=2,
    label="Enter the text",
    value="bir kahve dükkanında geçen film tadında güzel bir şarkıya ayrılsın gece falling in love at a coffee shop",
)

# The model is loaded once here only to learn how many labels are available
# for the first slider; the model object itself is not kept.
parameters = {
    "top_labels": gr.Slider(minimum=2, maximum=len(get_masklid()[1]), step=1, value=200, label="Limit LID to X Top Languages"),
    "beta": gr.Slider(minimum=1, maximum=100, value=20, step=1, label="Beta"),
    "alpha": gr.Slider(minimum=1, maximum=30, value=3, step=1, label="Alpha"),
    "max_lambda": gr.Slider(minimum=1, maximum=10, value=3, step=1, label="Max Iteration"),
    "min_length": gr.Slider(minimum=1, maximum=100, value=10, step=1, label="Min Length"),
    "min_prob": gr.Slider(minimum=0.0, maximum=1.0, value=0.9, step=0.01, label="Min Probability"),
    "max_retry": gr.Slider(minimum=1, maximum=10, value=3, step=1, label="Max Retry In total"),
    "alpha_step_increase": gr.Slider(minimum=1, maximum=10, value=3, step=1, label="Alpha Step Increase"),
    "beta_step_increase": gr.Slider(minimum=1, maximum=15, value=5, step=1, label="Beta Step Increase"),
}

output = gr.JSON(label="Output")

# The dict preserves insertion order, so the sliders line up positionally with
# the parameters of predict_codeswitch after ``text``.
gr.Interface(
    fn=predict_codeswitch,
    inputs=[inputs, *parameters.values()],
    outputs=output,
    title="MaskLID (Code-Switch Language Identification)",
    description=render_metadata(),
    cache_examples=False,
).launch()