UnarineLeo committed on
Commit
9ba3728
1 Parent(s): e255bad

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +26 -13
app.py CHANGED
@@ -1,8 +1,10 @@
1
  import streamlit as st
2
  from transformers import pipeline
3
 
 
4
  unmasker = pipeline('fill-mask', model='dsfsi/zabantu-xlm-roberta')
5
 
 
6
  sample_sentences = {
7
  'Zulu': "Le ndoda ithi izo____ ukudla.",
8
  'Tshivenda': "Mufana uyo____ vhukuma.",
@@ -11,46 +13,58 @@ sample_sentences = {
11
  'Tsonga': "N'wana wa xisati u ____ ku tsaka."
12
  }
13
 
 
14
  def fill_mask_for_languages(sentences):
15
  results = {}
16
  for language, sentence in sentences.items():
17
  masked_sentence = sentence.replace('____', unmasker.tokenizer.mask_token)
18
-
19
  unmasked = unmasker(masked_sentence)
20
-
21
- results[language] = unmasked
22
  return results
23
 
 
24
  def replace_mask(sentence, predicted_word):
25
  return sentence.replace("____", predicted_word)
26
 
 
27
  st.title("Fill Mask for Multiple Languages | Zabantu-XLM-Roberta")
28
  st.write("This app predicts the missing word for sentences in Zulu, Tshivenda, Sepedi, Tswana, and Tsonga using a Zabantu BERT model.")
29
 
 
30
  user_sentence = st.text_input("Enter your own sentence with a masked word (use '____'):", "\n".join(
31
- f"'{lang}': '{sentence}',"
32
- f"\n"
33
- for lang, sentence in sample_sentences.items()
34
- ))
35
 
 
36
  if st.button("Submit"):
 
37
  user_masked_sentence = user_sentence.replace('____', unmasker.tokenizer.mask_token)
38
 
 
39
  user_predictions = unmasker(user_masked_sentence)
40
 
41
  st.write("### Your Input:")
42
  st.write(f"Original sentence: {user_sentence}")
43
- st.write(f"Top prediction for the masked token: {user_predictions[0]['sequence']}")
44
 
 
 
 
 
 
 
 
 
45
  st.write("### Predictions for Sample Sentences:")
46
- for language, predictions in fill_mask_for_languages(sample_sentences).items():
 
 
47
  original_sentence = sample_sentences[language]
48
- predicted_sentence = replace_mask(sentence, predictions[0]['sequence'])
49
-
50
  st.write(f"Original sentence ({language}): {original_sentence}")
51
  st.write(f"Top prediction for the masked token: {predicted_sentence}\n")
52
  st.write("=" * 80)
53
 
 
54
  css = """
55
  <style>
56
  footer {display:none !important}
@@ -98,5 +112,4 @@ div[data-testid="stMarkdownContainer"] p {
98
  }
99
  </style>
100
  """
101
-
102
- st.markdown(css, unsafe_allow_html=True)
 
1
  import streamlit as st
2
  from transformers import pipeline
3
 
4
+ # Initialize the pipeline for masked language model
5
  unmasker = pipeline('fill-mask', model='dsfsi/zabantu-xlm-roberta')
6
 
7
+ # Sample sentences with masked words in various languages
8
  sample_sentences = {
9
  'Zulu': "Le ndoda ithi izo____ ukudla.",
10
  'Tshivenda': "Mufana uyo____ vhukuma.",
 
13
  'Tsonga': "N'wana wa xisati u ____ ku tsaka."
14
  }
15
 
16
+ # Function to fill mask for each language
17
def fill_mask_for_languages(sentences):
    """Run the fill-mask pipeline over a batch of masked sentences.

    Args:
        sentences: mapping of language name -> sentence containing the
            '____' placeholder where the missing word should go.

    Returns:
        Mapping of language name -> the pipeline's prediction list for
        that sentence (same structure the `unmasker` pipeline emits).
    """
    # Swap the human-friendly '____' marker for the model's real mask
    # token, then collect the pipeline output per language.
    return {
        language: unmasker(sentence.replace('____', unmasker.tokenizer.mask_token))
        for language, sentence in sentences.items()
    }
24
 
25
+ # Function to replace the mask token with the predicted word
26
def replace_mask(sentence, predicted_word):
    """Fill every '____' placeholder in *sentence* with *predicted_word*."""
    # Splitting on the placeholder and re-joining with the prediction is
    # equivalent to str.replace for a non-empty separator.
    pieces = sentence.split("____")
    return predicted_word.join(pieces)
28
 
29
# --- Streamlit UI ---
st.title("Fill Mask for Multiple Languages | Zabantu-XLM-Roberta")
st.write("This app predicts the missing word for sentences in Zulu, Tshivenda, Sepedi, Tswana, and Tsonga using a Zabantu BERT model.")

# Default input: one "'<lang>': '<sentence>'," line per sample sentence.
user_sentence = st.text_input("Enter your own sentence with a masked word (use '____'):", "\n".join(
    f"'{lang}': '{sentence}'," for lang, sentence in sample_sentences.items()
))

# When the user submits the input sentence
if st.button("Submit"):
    # Replace the human-friendly '____' placeholder with the model's
    # actual mask token before running the pipeline.
    user_masked_sentence = user_sentence.replace('____', unmasker.tokenizer.mask_token)

    # Get predictions for the user's sentence.
    user_predictions = unmasker(user_masked_sentence)

    st.write("### Your Input:")
    st.write(f"Original sentence: {user_sentence}")

    # With more than one mask token in the input, the fill-mask pipeline
    # returns a list of prediction lists (one per mask); normalise to the
    # predictions for the first mask so indexing below is safe.
    # (Previously the raw prediction structure was also dumped to the page
    # via st.write(user_predictions) — removed as leftover debug output.)
    if user_predictions and isinstance(user_predictions[0], list):
        user_predictions = user_predictions[0]

    # Display the top prediction for the masked token.
    if len(user_predictions) > 0:
        st.write(f"Top prediction for the masked token: {user_predictions[0]['sequence']}")

    # Predictions for the built-in sample sentences.
    st.write("### Predictions for Sample Sentences:")
    predictions = fill_mask_for_languages(sample_sentences)

    for language, language_predictions in predictions.items():
        original_sentence = sample_sentences[language]
        # 'token_str' is the predicted word itself; splice it into the
        # original sentence in place of the '____' placeholder.
        predicted_sentence = replace_mask(original_sentence, language_predictions[0]['token_str'])

        st.write(f"Original sentence ({language}): {original_sentence}")
        st.write(f"Top prediction for the masked token: {predicted_sentence}\n")
        st.write("=" * 80)
66
 
67
+ # Custom CSS for styling
68
  css = """
69
  <style>
70
  footer {display:none !important}
 
112
  }
113
  </style>
114
  """
115
+ st.markdown(css, unsafe_allow_html=True)