# HugoSchtr — bugfix tokenizer (commit 0449f98, verified)
import gradio as gr
from transformers import pipeline, AutoTokenizer
# Checkpoint identifier on the Hugging Face Hub.
model_name = 'ehri-ner/xlm-roberta-large-ehri-ner-all'

# Build the token-classification pipeline and a tokenizer from the same
# checkpoint so both stay in sync.
ner_model = pipeline('ner', model=model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
def predict(text):
    """Run named-entity recognition on *text* and merge subword pieces.

    Args:
        text: Raw input string to tag.

    Returns:
        dict mapping each reconstructed word to its predicted entity label.
        Note that dict keys are unique, so a repeated word keeps only the
        label of its last occurrence.
    """
    result = {}
    for entity in ner_model(text):
        word = entity['word']
        entity_type = entity['entity']
        # NOTE(review): '##' is the BERT WordPiece continuation marker;
        # XLM-RoBERTa tokenizers use SentencePiece (word-start marker '▁')
        # instead — confirm this branch actually fires for this model.
        # Guard on `result` so a leading continuation piece cannot raise
        # IndexError on an empty dict (it falls through and is stored as-is).
        if word.startswith('##') and result:
            # Continuation piece: append it to the most recently added word,
            # preserving that word's entity label.
            last_word = next(reversed(result))
            result[last_word + word[2:]] = result.pop(last_word)
        else:
            result[word] = entity_type
    return result
# Wire the prediction function into a minimal Gradio UI:
# a two-line text box in, raw JSON (word -> entity label) out.
text_input = gr.Textbox(lines=2, placeholder='Enter text here...')
iface = gr.Interface(fn=predict, inputs=text_input, outputs='json')

# Start the web app server.
iface.launch()