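# NusaBERT / model.py
#
# Hosting-page metadata captured together with this file (not part of the program):
#   StevenLimcorn's picture
#   Modified Document Search with 10000 samples, outputs accordion
#   860f760
#   raw
#   history blame
#   5.86 kB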
from utils import (
text_analysis_interface,
token_classification_interface,
search_interface,
text_interface,
SentenceSimilarity,
)
from transformers import pipeline
# Shared inference pipelines.
#
# Every checkpoint below is used by two demos: its own dedicated tab and the
# combined "Text Analysis" tab.  Each one is built exactly once here and the
# same object is referenced from both places, so the weights are loaded and
# kept in memory only once per process.
_sentiment_classifier = pipeline(
    "text-classification",
    model="w11wo/indonesian-roberta-base-sentiment-classifier",
    tokenizer="w11wo/indonesian-roberta-base-sentiment-classifier",
)
_emotion_classifier = pipeline(
    "text-classification",
    model="StevenLimcorn/indonesian-roberta-base-emotion-classifier",
    tokenizer="StevenLimcorn/indonesian-roberta-base-emotion-classifier",
)
# No explicit task is passed for the taggers; `pipeline` resolves the task
# from the checkpoint itself.
_pos_tagger = pipeline(model="w11wo/indonesian-roberta-base-posp-tagger")
_ner_tagger = pipeline(model="w11wo/indonesian-roberta-base-nerp-tagger")

# Registry of demos, keyed by display name.
#
# Each entry is a plain dict with the following keys:
#   "title"        - heading text for the demo.
#   "examples"     - sample input strings.
#   "output_label" - label for the output; a list of labels when the demo
#                    produces several outputs ("Text Analysis").
#   "desc"         - human-readable description of the demo.
#   "interface"    - one of the interface callables imported from `utils`.
#   "pipe"         - the inference object(s) backing the demo: a single
#                    pipeline, a list of pipelines, or a SentenceSimilarity
#                    instance.
#   "top_k"        - only on "Document Search"; presumably the number of
#                    documents to return (matches "Top 5" in the label) --
#                    confirm against `search_interface`.
models = {
    "Text Analysis": {
        "title": "# Text Analysis",
        "examples": [
            "Allianz adalah persuhaan asuransi yang di dirikan pada tanggal February 5, 1890 di Berlin, Jerman.",
            "Restaurant ini sangat tidak enak. Enakan Pizza Hut.",
            "Kacau lu ngerusakin rumah orang. Nih rumah yang punya Pak Presiden Jokowi.",
        ],
        # One label per pipeline in "pipe", in the same order.
        "output_label": [
            "Sentiment Analysis",
            "Emotion Classifier",
            "POS Tagging",
            "NER Tagging",
        ],
        "desc": "A tool to showcase the full capabilities of text analysis LazarusNLP has to offer.",
        "interface": text_analysis_interface,
        "pipe": [
            _sentiment_classifier,
            _emotion_classifier,
            _pos_tagger,
            _ner_tagger,
        ],
    },
    "Document Search": {
        "title": "# Document Search 🔍",
        "examples": ["Stadion bola Indonesia.", "Rusia dan Serbia", "Politik."],
        "output_label": "Top 5 related documents",
        "desc": "A semantic search tool to get the most related documents 📖 based on user's query.",
        "interface": search_interface,
        # NOTE(review): arguments look like (embedding model id, corpus file);
        # confirm against SentenceSimilarity in utils.
        "pipe": SentenceSimilarity("LazarusNLP/all-indobert-base-v2", "sample.json"),
        "top_k": 5,
    },
    "Sentiment Analysis": {
        "title": "Sentiment Analysis",
        "examples": [
            "saya kecewa karena pengeditan biodata penumpang dilakukan by sistem tanpa konfirmasi dan solusi permasalahan nya pun dianggap sepele karena dibiarkan begitu saja sedang pelayanan pelanggan yang sudah berkali-berkali dihubungi pun hanya seperti mengulur waktu.",
            "saya sudah transfer ratusan ribu dan sesuai nominal transfer. tapi tiket belum muncul juga. harus diwaspadai ini aplikasi ini.",
            "keren sekali aplikasi ini bisa menunjukan data diri secara detail, sangat di rekomendasikan untuk di pakai.",
        ],
        "output_label": "Sentiment Analysis",
        "desc": "A sentiment-text-classification model based on the RoBERTa model. The model was originally the pre-trained Indonesian RoBERTa Base model, which is then fine-tuned on indonlu's SmSA dataset consisting of Indonesian comments and reviews.",
        "interface": text_interface,
        "pipe": _sentiment_classifier,
    },
    "Emotion Detection": {
        "title": "Emotion Classifier",
        "examples": [
            "iya semoga itu karya terbaik mu adalah skripsi mu dan lucua2n mu tapi harapan aku dari kamu adalah kesembuhanmu nold",
            "saya ganteng, kalau tidak-suka mati saja kamu",
            "Bahaha.. dia ke kasir after me. Sambil ngangkat keresek belanjaanku, masih sempet liat mas nya nyodorin barang belanjaannya",
        ],
        "output_label": "Emotion Classifier",
        "desc": "An emotion classifier based on the RoBERTa model. The model was originally the pre-trained Indonesian RoBERTa Base model, which is then fine-tuned on indonlu's EmoT dataset",
        "interface": text_interface,
        "pipe": _emotion_classifier,
    },
    # Disabled entry, kept for reference:
    # "summarization": {
    #     "examples": [],
    #     "desc": "This model is a fine-tuned version of LazarusNLP/IndoNanoT5-base on the indonlg dataset.",
    # },
    "POS Tagging": {
        "title": "POS Tagging",
        "examples": [
            "iya semoga itu karya terbaik mu adalah skripsi mu dan lucua2n mu tapi harapan aku dari kamu adalah kesembuhanmu nold",
            "saya ganteng, kalau tidak-suka mati saja kamu",
            "Bahaha.. dia ke kasir after me. Sambil ngangkat keresek belanjaanku, masih sempet liat mas nya nyodorin barang belanjaannya",
        ],
        "output_label": "POS Tagging",
        "desc": "A part-of-speech token-classification model based on the RoBERTa model. The model was originally the pre-trained Indonesian RoBERTa Base model, which is then fine-tuned on indonlu's POSP dataset consisting of tag-labelled news.",
        "interface": token_classification_interface,
        "pipe": _pos_tagger,
    },
    "NER Tagging": {
        "title": "NER Tagging",
        "examples": [
            "Paris adalah ibukota dari negara Prancis.",
            "Kuasa hukum teamster berasal dari Edmonton.",
            "Jakarta, Indonesia akan menjadi bagian salah satu tempat yang akan didatangi.",
        ],
        "output_label": "NER Tagging",
        "desc": "A NER Tagging token-classification model based on the RoBERTa model. The model was originally the pre-trained Indonesian RoBERTa Base model, which is then fine-tuned on indonlu's NERP dataset consisting of tag-labelled news.",
        "interface": token_classification_interface,
        "pipe": _ner_tagger,
    },
}