fill-mask-zabantu-xlm-roberta

Running

App Files Files Community

UnarineLeo committed 22 days ago

Commit

7a621b0

•

1 Parent(s): d104ff1

Update app.py

Browse files

Files changed (1) hide show

app.py +13 -27

app.py CHANGED Viewed

@@ -1,67 +1,52 @@
 import streamlit as st
 from transformers import pipeline
-# Initialize the pipeline for the fill-mask task
 unmasker = pipeline('fill-mask', model='dsfsi/zabantu-bantu-250m')
-# Sample sentences for different languages with placeholders for the masked word
 sample_sentences = {
-    'Zulu': "Le ndoda ithi izo____ ukudla.",  # Masked word for Zulu
-    'Tshivenda': "Mufana uyo____ vhukuma.",  # Masked word for Tshivenda
-    'Sepedi': "Mosadi o ____ pheka.",  # Masked word for Sepedi
-    'Tswana': "Monna o ____ tsamaya.",  # Masked word for Tswana
-    'Tsonga': "N'wana wa xisati u ____ ku tsaka."  # Masked word for Tsonga
 }
-# Function to perform fill-mask on sentences with the token '__' replaced
 def fill_mask_for_languages(sentences):
     results = {}
     for language, sentence in sentences.items():
-        # Replace the '____' placeholder with the model's mask token
         masked_sentence = sentence.replace('____', unmasker.tokenizer.mask_token)
-        # Get predictions for the masked sentence
         unmasked = unmasker(masked_sentence)
-        # Store the result for each language
         results[language] = unmasked
     return results
-# Streamlit interface
 st.title("Fill Mask for Multiple Languages | Zabantu-Bantu-250m")
 st.write("This app predicts the missing word for sentences in Zulu, Tshivenda, Sepedi, Tswana, and Tsonga using a Zabantu BERT model.")
-# Display the original sample sentences
-st.write("### Sample sentences:")
-for language, sentence in sample_sentences.items():
-    st.write(f"**{language}**: {sentence}")
-# User input for custom sentences
-user_sentence = st.text_input("Enter your own sentence with a masked word (use '____'):", "Enter sentence here...")
-# Add a submit button
 if st.button("Submit"):
-    # Prepare user input for prediction
     user_masked_sentence = user_sentence.replace('____', unmasker.tokenizer.mask_token)
-    # Get predictions for the user input sentence
     user_predictions = unmasker(user_masked_sentence)
-    # Display results for user input
     st.write("### Your Input:")
     st.write(f"Original sentence: {user_sentence}")
     st.write(f"Top prediction for the masked token: {user_predictions[0]['sequence']}")
-    # Display results for sample sentences
     st.write("### Predictions for Sample Sentences:")
     for language, predictions in fill_mask_for_languages(sample_sentences).items():
         original_sentence = sample_sentences[language]
-        predicted_sentence = predictions[0]['sequence']
         st.write(f"Original sentence ({language}): {original_sentence}")
         st.write(f"Top prediction for the masked token: {predicted_sentence}\n")
         st.write("=" * 80)
-# Custom CSS styling for Streamlit elements
 css = """
 <style>
 footer {display:none !important}
@@ -109,4 +94,5 @@ div[data-testid="stMarkdownContainer"] p {
 }
 </style>
 """
-st.markdown(css, unsafe_allow_html=True)

 import streamlit as st
 from transformers import pipeline
 # Fill-mask pipeline for the 'dsfsi/zabantu-bantu-250m' model, built at module level.
 # NOTE(review): Streamlit presumably re-executes this script on every interaction, which
 # would rebuild the pipeline each time — confirm, and consider caching the loaded pipeline.
 unmasker = pipeline('fill-mask', model='dsfsi/zabantu-bantu-250m')
 # One sample sentence per language; '____' marks the word the model should predict.
 sample_sentences = {
+    'Zulu': "Le ndoda ithi izo____ ukudla.",
+    'Tshivenda': "Mufana uyo____ vhukuma.",
+    'Sepedi': "Mosadi o ____ pheka.",
+    'Tswana': "Monna o ____ tsamaya.",
+    'Tsonga': "N'wana wa xisati u ____ ku tsaka."
 }
 def fill_mask_for_languages(sentences):
     """Run the fill-mask pipeline on each language's sentence.

     Every '____' placeholder in a sentence is swapped for the tokenizer's
     mask token before the sentence is handed to ``unmasker``.

     Args:
         sentences: Mapping of language name to a sentence containing '____'.

     Returns:
         Dict mapping each language name to whatever ``unmasker`` returned
         for that language's masked sentence.
     """
     return {
         lang: unmasker(text.replace('____', unmasker.tokenizer.mask_token))
         for lang, text in sentences.items()
     }
+def replace_mask(sentence, predicted_word):
+    return sentence.replace("____", predicted_word)
 st.title("Fill Mask for Multiple Languages | Zabantu-Bantu-250m")
 st.write("This app predicts the missing word for sentences in Zulu, Tshivenda, Sepedi, Tswana, and Tsonga using a Zabantu BERT model.")
 # Prefill the input with one real sample sentence. Joining the dict itself
 # iterates its keys (the language names), which yields a default value with
 # no '____' placeholder; a single sample keeps the default a valid
 # one-mask query for the Submit handler.
 user_sentence = st.text_input(
     "Enter your own sentence with a masked word (use '____'):",
     next(iter(sample_sentences.values())),
 )
 if st.button("Submit"):
     # Swap the user-facing '____' placeholder for the model's own mask token.
     mask_token = unmasker.tokenizer.mask_token
     user_masked_sentence = user_sentence.replace('____', mask_token)
     st.write("### Your Input:")
     st.write(f"Original sentence: {user_sentence}")
     # The indexing below (user_predictions[0]['sequence']) is only valid for
     # a single mask: with no mask the pipeline raises, and with several it
     # returns one list per mask. Guard instead of crashing the app.
     if user_masked_sentence.count(mask_token) == 1:
         user_predictions = unmasker(user_masked_sentence)
         st.write(f"Top prediction for the masked token: {user_predictions[0]['sequence']}")
     else:
         st.warning("Please enter a sentence containing exactly one '____' placeholder.")
     st.write("### Predictions for Sample Sentences:")
     for language, predictions in fill_mask_for_languages(sample_sentences).items():
         original_sentence = sample_sentences[language]
         # Fill the placeholder with the predicted token itself ('token_str'),
         # not the full predicted sentence, and use the sentence that is
         # actually in scope here (the old code named an undefined variable).
         predicted_sentence = replace_mask(original_sentence, predictions[0]['token_str'])
         st.write(f"Original sentence ({language}): {original_sentence}")
         st.write(f"Top prediction for the masked token: {predicted_sentence}\n")
         st.write("=" * 80)
 css = """
 <style>
 footer {display:none !important}
 }
 </style>
 """
+st.markdown(css, unsafe_allow_html=True)