|
--- |
|
datasets: |
|
- hynky/czech_news_dataset_v2 |
|
language: |
|
- cs |
|
library_name: transformers |
|
tags: |
|
- news |
|
- nlp |
|
- czech |
|
--- |
|
|
|
- A model for predicting the source (news server) of Czech news articles.
|
## Usage: |
|
|
|
```python
|
import re |
|
from transformers import pipeline |
|
from html import unescape |
|
from unicodedata import normalize |
|
|
|
# Matches any run of whitespace characters (spaces, tabs, newlines, ...).
re_multispace = re.compile(r"\s+")


def normalize_text(text):
    """Clean up raw article text before it is handed to the classifier.

    The steps run in this order: strip leading/trailing whitespace,
    collapse every whitespace run into a single space, decode HTML
    entities, and apply Unicode NFKC normalization.

    Args:
        text: Raw article text, or ``None``.

    Returns:
        The normalized string, or ``None`` when ``text`` is ``None``.
    """
    if text is None:
        return None

    text = text.strip()
    # ``\s+`` already matches "\n", "\t" and "\r", so a single substitution
    # both replaces them and collapses repeated whitespace.
    text = re_multispace.sub(" ", text)
    # Entities are decoded after whitespace collapsing; the order is kept
    # as-is so the output stays identical to the previous implementation.
    text = unescape(text)
    return normalize("NFKC", text)
|
|
|
|
|
# Text-classification pipeline: the "hynky/Server" checkpoint paired with the
# "ufal/robeczech-base" tokenizer. Inputs are truncated to at most 512 tokens
# and ``top_k=5`` requests the five best-scoring labels for each input.
model = pipeline(
    task="text-classification",
    model="hynky/Server",
    tokenizer="ufal/robeczech-base",
    truncation=True,
    max_length=512,
    top_k=5,
)
|
|
|
|
|
def predict(article):
    """Normalize an article and return the classifier's predictions for it.

    Args:
        article: Raw article text.

    Returns:
        Whatever ``model`` returns for the normalized text; the pipeline
        is configured with ``top_k=5``.
    """
    article = normalize_text(article)
    return model(article)


# Example: classify a short Czech sentence and show the result.
print(predict("Dnes v noci bude pršet."))
|
```