"""Streamlit app: fill-mask predictions for several Bantu languages.

Uses the Zabantu XLM-RoBERTa masked-language model to predict the word
hidden behind the '____' placeholder, for both user-supplied sentences
and a set of built-in sample sentences.
"""

import streamlit as st
from transformers import pipeline

# Initialize the pipeline for masked language model
# (downloads the model weights on first run).
unmasker = pipeline('fill-mask', model='dsfsi/zabantu-xlm-roberta')

# Sample sentences with masked words in various languages.
# '____' is the user-facing placeholder; it is swapped for the model's
# actual mask token just before prediction.
sample_sentences = {
    'Zulu': "Le ndoda ithi izo____ ukudla.",
    'Tshivenda': "Mufana uyo____ vhukuma.",
    'Sepedi': "Mosadi o ____ pheka.",
    'Tswana': "Monna o ____ tsamaya.",
    'Tsonga': "N'wana wa xisati u ____ ku tsaka."
}


def fill_mask_for_languages(sentences):
    """Run the fill-mask pipeline over a mapping of language -> sentence.

    Args:
        sentences: dict mapping a language name to a sentence containing
            the '____' placeholder.

    Returns:
        dict mapping each language to the pipeline's prediction list
        (one dict per candidate, including 'token_str' and 'sequence').
    """
    results = {}
    for language, sentence in sentences.items():
        masked_sentence = sentence.replace('____', unmasker.tokenizer.mask_token)
        # Store the predictions in the results dictionary.
        results[language] = unmasker(masked_sentence)
    return results


def replace_mask(sentence, predicted_word):
    """Return *sentence* with the '____' placeholder replaced by *predicted_word*."""
    return sentence.replace("____", predicted_word)


# ---------------------------------------------------------------- Streamlit UI
st.title("Fill Mask for Multiple Languages | Zabantu-XLM-Roberta")
st.write("This app predicts the missing word for sentences in Zulu, Tshivenda, Sepedi, Tswana, and Tsonga using a Zabantu BERT model.")

# Get user input; the default value lists the sample sentences as a template.
user_sentence = st.text_input(
    "Enter your own sentence with a masked word (use '____'):",
    "\n".join(
        f"'{lang}': '{sentence}'," for lang, sentence in sample_sentences.items()
    ),
)

# When user submits the input sentence.
if st.button("Submit"):
    if '____' not in user_sentence:
        # The pipeline raises if the input has no mask token — fail gracefully.
        st.error("Please include '____' in your sentence to mark the missing word.")
    else:
        # Replace the placeholder with the actual mask token.
        user_masked_sentence = user_sentence.replace('____', unmasker.tokenizer.mask_token)

        # Get predictions for the user's sentence.
        user_predictions = unmasker(user_masked_sentence)

        st.write("### Your Input:")
        st.write(f"Original sentence: {user_sentence}")

        # Show the raw prediction structure for transparency/debugging.
        st.write(user_predictions)

        # Display the top prediction for the masked token.
        if len(user_predictions) > 0:
            top = user_predictions[0]
            # With multiple mask tokens the pipeline returns a list of
            # prediction lists (one per mask); unwrap to the first mask's
            # top candidate so indexing by key does not fail.
            if isinstance(top, list):
                top = top[0]
            st.write(f"Top prediction for the masked token: {top['sequence']}")

# Predictions for sample sentences.
st.write("### Predictions for Sample Sentences:")
predictions = fill_mask_for_languages(sample_sentences)

for language, language_predictions in predictions.items():
    original_sentence = sample_sentences[language]
    # Use token_str (the predicted word alone) to fill the placeholder.
    predicted_sentence = replace_mask(original_sentence, language_predictions[0]['token_str'])
    st.write(f"Original sentence ({language}): {original_sentence}")
    st.write(f"Top prediction for the masked token: {predicted_sentence}\n")
    st.write("=" * 80)

# Custom CSS for styling (placeholder — currently empty).
css = """
"""
st.markdown(css, unsafe_allow_html=True)