Spaces:
Runtime error
Runtime error
akuysal
committed on
Commit
•
a8dbb61
0
Parent(s):
Duplicate from akuysal/SMS-spam-Turkish-sklearn
Browse files- .gitattributes +34 -0
- LinearSVC_SMS_spam_TR.pickle +3 -0
- README.md +22 -0
- app.py +52 -0
- requirements.txt +3 -0
- tfidf_vectorizer_TR.pickle +3 -0
.gitattributes
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
29 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
30 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
31 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
32 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
33 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
34 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
LinearSVC_SMS_spam_TR.pickle
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2e1b32d1f4716a7c48facea2b8630b897be52618b461cbb2bb4f20f34b9df52f
|
3 |
+
size 23303
|
README.md
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
title: SMS Spam Turkish Scikit-Learn
|
3 |
+
emoji: 🌖
|
4 |
+
colorFrom: gray
|
5 |
+
colorTo: green
|
6 |
+
sdk: streamlit
|
7 |
+
sdk_version: 1.17.0
|
8 |
+
app_file: app.py
|
9 |
+
pinned: false
|
10 |
+
license: openrail
|
11 |
+
duplicated_from: akuysal/SMS-spam-Turkish-sklearn
|
12 |
+
---
|
13 |
+
|
14 |
+
ENGLISH
|
15 |
+
The dataset used in the study "Uysal, A. K., Gunal, S., Ergin, S., & Gunal, E. S. (2013). The impact of feature extraction and selection on SMS spam filtering. Elektronika ir Elektrotechnika, 19(5), 67-72." is employed for training. The Linear SVM classifier achieves a Macro-F1 score of 0.9880 when 10% of the dataset is held out for testing.
|
16 |
+
The dataset is composed of SPAM and LEGITIMATE SMS messages.
|
17 |
+
|
18 |
+
TÜRKÇE
|
19 |
+
Bu çalışmada "Uysal, A. K., Gunal, S., Ergin, S., & Gunal, E. S. (2013). The impact of feature extraction and selection on SMS spam filtering. Elektronika ir Elektrotechnika, 19(5), 67-72." başlıklı çalışmadaki veri seti kullanılmıştır. Linear SVM sınıflandırıcı için başarı oranı, veri setinin %10'u test için kullanıldığında Makro-F1 açısından 0,9880'dir.
|
20 |
+
Veri seti, SPAM ve LEGITIMATE kısa mesaj verilerinden oluşmaktadır.
|
21 |
+
|
22 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
ADDED
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
2 |
+
from TurkishStemmer import TurkishStemmer
|
3 |
+
import string
|
4 |
+
# import for loading python objects (scikit-learn models)
|
5 |
+
import pickle
|
6 |
+
import nltk
|
7 |
+
from nltk.data import load
|
8 |
+
import streamlit as st
|
9 |
+
import sklearn
|
10 |
+
|
11 |
+
# Tokenizer data required by nltk.word_tokenize. Older NLTK releases load the
# "punkt" resource, while newer ones load "punkt_tab" instead; since the NLTK
# version is not pinned, fetch both so tokenization works either way.
for _nltk_resource in ("punkt", "punkt_tab"):
    nltk.download(_nltk_resource)

# Translation table for str.translate that deletes every ASCII punctuation
# character and digit (each code point maps to None).
trans_table = str.maketrans("", "", string.punctuation + string.digits)
|
13 |
+
|
14 |
+
def custom_tokenizer_with_Turkish_stemmer(text):
    """Tokenize ``text`` and return the stem of each lowercased token.

    Characters covered by the module-level ``trans_table`` are removed first,
    the remainder is split with ``nltk.word_tokenize``, and every token is
    lowercased and passed through the module-level ``stemmerTR`` stemmer.

    NOTE(review): the pickled TF-IDF vectorizer presumably refers to this
    function by name as its tokenizer, so the name should stay unchanged;
    confirm against the training script.

    Args:
        text: The raw message string.

    Returns:
        A list of stemmed tokens, in the order they appear in ``text``.
    """
    cleaned = text.translate(trans_table)
    return [stemmerTR.stem(token.lower()) for token in nltk.word_tokenize(cleaned)]
|
21 |
+
|
22 |
+
def predictSMSdata(test_text):
    """Classify a single SMS message and return its category name.

    Loads the pickled classifier and TF-IDF vectorizer from the current
    working directory, vectorizes ``test_text`` and maps the predicted label
    to a category name.

    Args:
        test_text: The message to classify.

    Returns:
        ``"legitimate"`` or ``"spam"``.

    Raises:
        OSError: If either pickle file cannot be opened.
    """
    # The prediction is used as an index into this alphabetically sorted list.
    # NOTE(review): assumes the classifier emits integer labels encoded in
    # the same alphabetical order; confirm against the training script.
    categories = sorted(["legitimate", "spam"])

    # SECURITY NOTE: pickle.load can execute arbitrary code. This is only
    # acceptable because both files are shipped with the app; never point
    # these paths at user-supplied data.
    # ``with`` guarantees the handles are closed even if unpickling fails.
    with open("LinearSVC_SMS_spam_TR.pickle", "rb") as model_file:
        classifier = pickle.load(model_file)

    # Vectorizer used to transform the input text into TF-IDF features.
    with open("tfidf_vectorizer_TR.pickle", "rb") as vectorizer_file:
        tfidf_vectorizer = pickle.load(vectorizer_file)

    # transform() expects a collection of documents, so wrap the single text.
    features = tfidf_vectorizer.transform([test_text])
    predicted = classifier.predict(features)
    return categories[predicted[0]]
|
43 |
+
|
44 |
+
# Stemmer instance that custom_tokenizer_with_Turkish_stemmer reads as a
# module-level name; it must be bound before any prediction is made.
stemmerTR = TurkishStemmer()

# Sample message pre-filled in the text box.
default_value = "Aveadan SUPER bir Muzik Paketi! MAXI yaz, 5555e gonder"
text = st.text_area("enter some text!", default_value)

# Only classify when the text box is non-empty.
if text:
    out = predictSMSdata(text)
    st.write(f"The category of SMS = {out.upper()}")
|
52 |
+
|
requirements.txt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
TurkishStemmer==1.3
|
2 |
+
scikit-learn>=1.1
|
3 |
+
nltk
|
tfidf_vectorizer_TR.pickle
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:fd7ad6fcbd377d3025072502492b36208d32dba87ba4d73bd86171c48b74ba33
|
3 |
+
size 82481
|