Batch inference?
#35
by dgaff - opened
Hey all! Excited to use this model - would there be any easy way to extend this to tokenize/predict in batch?
Answered my own question after a bit!
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
import numpy as np
from scipy.special import softmax

# Pick a device (GPU if available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Preprocess text (username and link placeholders)
def preprocess(text):
    new_text = []
    for t in text.split(" "):
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)
# Load the model
model = AutoModelForSequenceClassification.from_pretrained(MODEL).to(device)
# Batch of texts
texts = [
    "Covid cases are increasing fast!",
    "Vaccines are effective against severe illness.",
    "The weather is lovely today.",
    "Stock markets are crashing again!"
]
# Preprocess all texts
preprocessed_texts = [preprocess(text) for text in texts]
# Tokenize inputs as a batch
encoded_inputs = tokenizer(preprocessed_texts, padding=True, truncation=True, return_tensors='pt').to(device)
# Run inference on the batch (no gradients needed at inference time)
with torch.no_grad():
    outputs = model(**encoded_inputs)
# Move logits to CPU before converting to numpy (required if the model is on GPU)
logits = outputs.logits.cpu().numpy()
# Apply softmax row-wise to get per-class probabilities
probs = softmax(logits, axis=1)
# Print labels and scores for each input
for idx, text in enumerate(preprocessed_texts):
    print(f"Text: {text}")
    scores = probs[idx]
    ranking = np.argsort(scores)[::-1]  # Sort class indices by score, descending
    for i in range(scores.shape[0]):
        label = config.id2label[ranking[i]]
        score = scores[ranking[i]]
        print(f"  {i+1}) {label}: {np.round(float(score), 4)}")
    print("-" * 50)
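If you have far more texts than fit comfortably in one forward pass, you can process them in fixed-size chunks to keep memory bounded. Here's a minimal sketch; the `predict_batched` helper and the batch size of 32 are just illustrative choices (not part of the model's API), and it reuses the `preprocess`, `tokenizer`, `model`, and `device` objects defined above:

import numpy as np
import torch
from scipy.special import softmax

def predict_batched(texts, batch_size=32):
    """Score texts in fixed-size chunks; returns an (N, num_labels) probability array."""
    all_probs = []
    for start in range(0, len(texts), batch_size):
        # Preprocess and tokenize one chunk at a time
        chunk = [preprocess(t) for t in texts[start:start + batch_size]]
        enc = tokenizer(chunk, padding=True, truncation=True, return_tensors='pt').to(device)
        with torch.no_grad():
            logits = model(**enc).logits.cpu().numpy()
        all_probs.append(softmax(logits, axis=1))
    return np.concatenate(all_probs, axis=0)

probs = predict_batched(texts)
print(probs.shape)  # (4, 3) here: this model has negative/neutral/positive labels

Padding is applied per chunk, so a chunk of short tweets isn't padded out to the length of the longest text in the whole dataset, which also helps throughput.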
dgaff changed discussion status to closed