Batch inference? #35

by dgaff - opened

Hey all! Excited to use this model - would there be any easy way to extend this to tokenize/predict in batches?

Answered my own question after a bit!

import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
import numpy as np
from scipy.special import softmax

# Preprocess text (username and link placeholders)
def preprocess(text):
    new_text = []
    for t in text.split(" "):
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)

MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)

# Pick a device and load the model onto it
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForSequenceClassification.from_pretrained(MODEL).to(device)

# Batch of texts
texts = [
    "Covid cases are increasing fast!",
    "Vaccines are effective against severe illness.",
    "The weather is lovely today.",
    "Stock markets are crashing again!"
]

# Preprocess all texts
preprocessed_texts = [preprocess(text) for text in texts]

# Tokenize inputs as a batch
encoded_inputs = tokenizer(preprocessed_texts, padding=True, truncation=True, return_tensors='pt').to(device)

# Run inference on the batch (no gradients needed)
with torch.no_grad():
    outputs = model(**encoded_inputs)
logits = outputs.logits.cpu().numpy()

# Apply softmax row-wise to get per-class probabilities
probs = softmax(logits, axis=1)

# Print labels and scores for each input
for idx, text in enumerate(preprocessed_texts):
    print(f"Text: {text}")
    scores = probs[idx]
    ranking = np.argsort(scores)[::-1]  # Sort descending
    for i in range(scores.shape[0]):
        label = config.id2label[ranking[i]]
        score = scores[ranking[i]]
        print(f"  {i+1}) {label}: {np.round(float(score), 4)}")
    print("-" * 50)
dgaff changed discussion status to closed
