# HugoSchtr — bugfix tokenizer (commit 0449f98, verified)
import gradio as gr
from transformers import pipeline, AutoTokenizer
# Checkpoint identifier on the Hugging Face Hub.
model_name = 'ehri-ner/xlm-roberta-large-ehri-ner-all'

# Build the token-classification pipeline and a tokenizer from the same
# checkpoint so both stay in sync.
ner_model = pipeline('ner', model=model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
def predict(text):
    """Run named-entity recognition on *text* and merge subword pieces.

    Args:
        text: Raw input string to tag.

    Returns:
        dict mapping each reconstructed word to its predicted entity label.
        Note that dict keys are unique, so a repeated word keeps only the
        label of its last occurrence.
    """
    result = {}
    for entity in ner_model(text):
        word = entity['word']
        entity_type = entity['entity']
        # NOTE(review): '##' is the BERT WordPiece continuation marker;
        # XLM-RoBERTa tokenizers use SentencePiece (word-start marker '▁')
        # instead — confirm this branch actually fires for this model.
        # Guard on `result` so a leading continuation piece cannot raise
        # IndexError on an empty dict (it falls through and is stored as-is).
        if word.startswith('##') and result:
            # Continuation piece: append it to the most recently added word,
            # preserving that word's entity label.
            last_word = next(reversed(result))
            result[last_word + word[2:]] = result.pop(last_word)
        else:
            result[word] = entity_type
    return result
# Wire the prediction function into a minimal Gradio UI:
# a two-line text box in, raw JSON (word -> entity label) out.
text_input = gr.Textbox(lines=2, placeholder='Enter text here...')
iface = gr.Interface(fn=predict, inputs=text_input, outputs='json')

# Start the web app server.
iface.launch()