Update pipeline.py
#1
by
alighadami77
- opened
- pipeline.py +49 -20
pipeline.py
CHANGED
@@ -1,36 +1,65 @@
|
|
1 |
# from scipy.special import softmax
|
2 |
import tensorflow as tf
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
|
4 |
class PreTrainedPipeline():
|
5 |
def __init__(self, path):
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
x = tf.keras.layers.Dense(512, activation="LeakyReLU")(x)
|
13 |
-
x = tf.keras.layers.Dense(1024, activation="LeakyReLU")(x)
|
14 |
-
x = tf.keras.layers.Dense(2048, activation="LeakyReLU")(x)
|
15 |
-
outputs = tf.keras.layers.Dense(300, activation="tanh")(x)
|
16 |
|
17 |
-
|
|
|
18 |
|
19 |
-
model.
|
20 |
|
21 |
-
|
22 |
|
23 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
24 |
|
25 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
26 |
return [
|
27 |
-
[
|
28 |
-
{'label':
|
29 |
-
{'label':
|
30 |
-
{'label':
|
31 |
-
{'label':
|
32 |
]
|
33 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
34 |
|
35 |
# def RevDict(sent,flag,model):
|
36 |
# """
|
|
|
1 |
# from scipy.special import softmax
|
2 |
import tensorflow as tf
|
3 |
+
from transformers import Pipeline
|
4 |
+
import tensorflow as tf
|
5 |
+
import numpy as np
|
6 |
+
import json
|
7 |
+
from hazm import *
|
8 |
+
from scipy.spatial import distance
|
9 |
+
|
10 |
|
11 |
class PreTrainedPipeline():
    """Map an input sentence to the labels of its nearest stored embeddings.

    The pipeline normalizes and tokenizes the sentence with hazm, drops
    stopwords, converts the remaining tokens to ids, runs the TensorFlow
    SavedModel on those ids and ranks the rows of a precomputed comparison
    matrix by cosine distance to the model output.
    """

    # Number of token slots fed to the model; longer inputs are truncated
    # and shorter ones are left zero-padded.
    MAX_TOKENS = 20
    # How many nearest rows of the comparison matrix are looked up.
    CANDIDATE_COUNT = 10
    # How many of those candidates are returned to the caller.
    RESULT_COUNT = 4

    def __init__(self, path):
        """Load the vocabulary, stopwords, comparison matrix and model.

        Args:
            path: Directory searched first for the resource files.
                Presumably the model repository directory -- confirm
                against the caller. Any resource not found there falls
                back to its bare relative name, which is the only lookup
                the previous implementation performed.
        """
        self.model_dir = self._resolve(path, "saved_model")
        self.t2id_path = self._resolve(path, "t2id.json")
        self.stopwords_path = self._resolve(path, "stopwords.txt")
        self.id2h_path = self._resolve(path, "id2h.json")
        # NOTE(review): this attribute was read but never assigned, so
        # construction always raised AttributeError. The ['arr_0'] lookup
        # below implies an .npz archive; the file name itself is an
        # assumption -- confirm it against the repository contents.
        self.comparison_matrix_path = self._resolve(
            path, "comparison_matrix.npz"
        )

        # Context managers make sure every handle is closed again.
        with open(self.t2id_path, encoding="utf8") as handle:
            self.t2id = json.load(handle)
        with open(self.id2h_path, encoding="utf8") as handle:
            self.id2h = json.load(handle)
        with open(self.stopwords_path, encoding="utf8") as handle:
            self.stopwords = {line.strip() for line in handle}
        with np.load(self.comparison_matrix_path) as archive:
            self.comparisons = archive['arr_0']

        # Built once here instead of on every call.
        self.normalizer = Normalizer()
        self.model = tf.saved_model.load(self.model_dir)

    @staticmethod
    def _resolve(path, name):
        """Return ``name`` inside ``path`` if it exists there, else ``name``."""
        import os

        if path:
            candidate = os.path.join(path, name)
            if os.path.exists(candidate):
                return candidate
        return name

    def _encode(self, tokens):
        """Convert tokens to a zero-padded ``(1, MAX_TOKENS)`` id array.

        Tokens missing from ``self.t2id`` get the id stored under 'UNK';
        tokens beyond ``MAX_TOKENS`` are ignored.
        """
        input_ids = np.zeros((1, self.MAX_TOKENS))
        for position, token in enumerate(tokens[:self.MAX_TOKENS]):
            input_ids[0, position] = self.t2id.get(token, self.t2id['UNK'])
        return input_ids

    def _nearest_labels(self, embedding):
        """Return labels of the comparison rows closest to ``embedding``.

        Rows are ordered by ascending cosine distance and at most
        ``CANDIDATE_COUNT`` are returned; fewer when the matrix has fewer
        rows. Each label is wrapped in a one-element list, as before.
        """
        # -1 lets the width follow the model output instead of a fixed 300.
        query = np.asarray(embedding).reshape((1, -1))
        distances = distance.cdist(query, self.comparisons, "cosine")[0]
        nearest = distances.argsort()[:self.CANDIDATE_COUNT]
        # id2h is keyed by the stringified row index.
        return [[self.id2h[str(index)]] for index in nearest]

    def __call__(self, inputs: str):
        """Return the closest labels for ``inputs``.

        Args:
            inputs: The raw sentence to look up.

        Returns:
            A list holding one list of ``{'label': ..., 'score': 0}``
            dicts, closest match first, with at most ``RESULT_COUNT``
            entries. The score is a constant placeholder.
        """
        # Preprocess the input sentence
        sentence = self.normalizer.normalize(inputs)
        tokens = [
            token
            for token in word_tokenize(sentence)
            if token not in self.stopwords
        ]
        input_ids = self._encode(tokens)

        # Call the model on the input ids
        embeddings = self.model(tf.constant(input_ids, dtype=tf.int32)).numpy()

        # Postprocess the embeddings to get the most similar words
        top_words = self._nearest_labels(embeddings)
        return [
            [
                {'label': label, 'score': 0}
                for label in top_words[:self.RESULT_COUNT]
            ]
        ]
|
53 |
+
|
54 |
+
|
55 |
+
# return [
|
56 |
+
# [ # Sample output, call the model here TODO
|
57 |
+
# {'label': 'POSITIVE', 'score': 0.05},
|
58 |
+
# {'label': 'NEGATIVE', 'score': 0.03},
|
59 |
+
# {'label': 'معنی', 'score': 0.92},
|
60 |
+
# {'label': f'{inputs}', 'score': 0},
|
61 |
+
# ]
|
62 |
+
# ]
|
63 |
|
64 |
# def RevDict(sent,flag,model):
|
65 |
# """
|