fill-mask-zabantu-xlm-roberta

Running

File size: 4,874 Bytes

f5f8f9a
 
15eff6a
f5f8f9a
0802504
6cbe2dc
4d5270d
f5f8f9a
4d5270d
f5f8f9a
4d5270d
b854242
4d5270d
0ce811e
 
b854242
4d5270d
 
 
f5f8f9a
7a621b0
b92b795
7a621b0
d99c1e8
4d5270d
d99c1e8
15eff6a
d99c1e8
 
 
 
 
4d5270d
d99c1e8
b51c864
d104ff1
d99c1e8
 
5b5eeef
d99c1e8
 
 
 
 
4d5270d
d99c1e8
 
fa59459
d99c1e8
 
0716ef3
 
d99c1e8
 
 
89b629e
b3dcfe5
d99c1e8
 
 
 
b3dcfe5
d99c1e8
 
 
b3dcfe5
 
d99c1e8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9ba3728
b51c864
d99c1e8
15eff6a
d99c1e8
b3dcfe5
 
 
 
 
 
 
 
 
 
 
d99c1e8
 
 
 
5b5eeef
d99c1e8
f5f8f9a
 
b51c864
d99c1e8
 
 
f5f8f9a
 
b3dcfe5

import streamlit as st
from transformers import pipeline
from io import StringIO

unmasker = pipeline('fill-mask', model='dsfsi/zabantu-bantu-250m')
st.stop_rerun = True
st.set_page_config(layout="wide")

def fill_mask(sentences):
    results = {}
    warnings = []
    for language, sentence in sentences.items():
        if "<mask>" in sentence:
            masked_sentence = sentence.replace('<mask>', unmasker.tokenizer.mask_token)
            unmasked = unmasker(masked_sentence)
            results[language] = unmasked
        else:
            warnings.append(f"Warning: No <mask> token found in sentence: {sentence}")
    return results, warnings

def replace_mask(sentence, predicted_word):
    return sentence.replace("<mask>", f"**{predicted_word}**")

# Set up title and description
st.title("Fill Mask | Zabantu-XLM-Roberta")
st.markdown("Zabantu-XLMR refers to a fleet of models trained on South African Bantu languages...")

# Initialize session state
if 'warnings' not in st.session_state:
    st.session_state['warnings'] = []
if 'results' not in st.session_state:
    st.session_state['results'] = {}

# Define layout
col1, col2 = st.columns(2)

with col1:
    st.markdown("### Input :clipboard:")

    select_options = ['Choose option', 'Enter text input', 'Upload a file(csv/txt)']
    sample_sentence = {
        'tshivenda': "Rabulasi wa <mask> u khou bvelela nga u lima.",
        "tsonga": "N'wana wa xisati u <mask> ku tsaka."
    }

    option_selected = st.selectbox("Select an input option:", select_options, index=0)
    input_sentences = {}

    if option_selected == 'Enter text input':
        st.session_state['warnings'].clear()  # Clear warnings before new input
        language_options = ['Choose language', 'Zulu', 'Tshivenda', 'Sepedi', 'Tswana', 'Tsonga']
        
        for i in range(5):
            language = st.selectbox(f"Select language for input {i+1}:", language_options, key=f'language_{i}')
            sentence = st.text_input(f"Enter sentence for input {i+1} (with <mask>):", key=f'sentence_{i}')
            
            # Only process filled language and sentence pairs
            if language != 'Choose language' and sentence:
                input_sentences[language.lower()] = sentence

        if st.button("Submit"):
            if input_sentences:
                results, warnings = fill_mask(input_sentences)
                st.session_state['results'] = results
                st.session_state['warnings'] = warnings
            else:
                st.warning("Please fill at least one language and sentence.")

    elif option_selected == 'Upload a file(csv/txt)':
        uploaded_file = st.file_uploader("Choose a file (one sentence per line)")
        if uploaded_file:
            stringio = StringIO(uploaded_file.getvalue().decode("utf-8"))
            sentences = stringio.read().splitlines()

            for i, sentence in enumerate(sentences):
                # Here, you might need to define how to assign a language to each sentence
                # Assuming all sentences are in the same language for simplicity
                input_sentences[f'input_{i+1}'] = sentence

            if st.button("Submit"):
                results, warnings = fill_mask(input_sentences)
                st.session_state['results'] = results
                st.session_state['warnings'] = warnings

    st.markdown("### Example")
    st.code(sample_sentence)
    if st.button("Test Example"):
        result, warnings = fill_mask(sample_sentence)
        st.session_state['results'] = result
        st.session_state['warnings'] = warnings

with col2:
    st.markdown("### Output :bar_chart:")
    
    if st.session_state['results']:
        # Use st.fragment for dynamic content
        with st.container():
            for language, predictions in st.session_state['results'].items():
                if predictions:
                    top_prediction = predictions[0]
                    predicted_word = top_prediction['token_str']
                    score = top_prediction['score'] * 100

                    # Displaying the prediction with fragment
                    st.markdown(f"**{language.capitalize()} Prediction:** {predicted_word} ({score:.2f}%)")
                    st.markdown(f"<div class='bar'><div class='bar-fill' style='width:{score}%;'></div></div>", unsafe_allow_html=True)

    if st.session_state['warnings']:
        for warning in st.session_state['warnings']:
            st.warning(warning)

# CSS for styling
css = """
<style>
footer {display:none !important;}
.bar {width: 70%; background-color: #e6e6e6; border-radius: 12px; height: 5px;}
.bar-fill {background-color: #17152e; height: 100%; border-radius: 12px;}
.container {display: flex; justify-content: space-between; align-items: center; margin-bottom: 5px;}
</style>
"""
st.markdown(css, unsafe_allow_html=True)