File size: 2,008 Bytes
fcb30fd 722ea14 fcb30fd bf1126c dc5b7ba 2167368 fe36855 0265bd2 722ea14 fcb30fd 722ea14 fcb30fd 722ea14 fcb30fd 722ea14 bf1126c 722ea14 bf1126c 722ea14 3e9f5ce 8f152eb 486d47e 722ea14 d22d58c 1ec5925 bf1126c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 |
import tensorflow as tf
from transformers import Pipeline
import tensorflow as tf
import numpy as np
import json
from hazm import *
from scipy.spatial import distance
class PreTrainedPipeline():
    """Callable pipeline that maps an input sentence to its nearest labels.

    On construction the pipeline loads, from the directory ``path``:

    * ``saved_model/``            - a TensorFlow SavedModel used as the encoder,
    * ``t2id.json``               - token -> integer id mapping (must contain
      an ``'UNK'`` entry used for out-of-vocabulary tokens),
    * ``id2h.json``               - mapping from the *string* form of a row
      index of the comparison matrix to the label returned for that row,
    * ``stopwords.txt``           - one stopword per line,
    * ``comparison_matrix.npz``   - archive whose ``arr_0`` entry holds the
      vectors the model output is compared against (one row per label).

    Calling the instance normalizes and tokenizes the text with ``hazm``,
    drops stopwords, encodes the tokens as ids, runs the model, and ranks the
    rows of the comparison matrix by cosine distance to the model output.
    """

    # Number of token ids fed to the model; longer inputs are truncated and
    # shorter ones keep the zero padding of the freshly allocated buffer.
    MAX_TOKENS = 20
    # Maximum number of candidates returned per call.
    TOP_K = 10
    # Multiplier applied to the negated cosine distances before the softmax;
    # larger values make the resulting score distribution sharper.
    SCORE_SCALE = 8

    def __init__(self, path):
        """Load the model and all lookup resources found under ``path``.

        Args:
            path: Directory containing the files listed in the class
                docstring. It is joined with ``"/"`` by string concatenation.

        Raises:
            OSError: If one of the resource files cannot be opened.
            KeyError: If the ``.npz`` archive has no ``arr_0`` entry.
        """
        self.model_dir = path + "/saved_model"
        self.t2id_path = path + "/t2id.json"
        self.id2h_path = path + "/id2h.json"
        self.stopwords_path = path + "/stopwords.txt"
        self.comparison_matrix_path = path + "/comparison_matrix.npz"

        # Context managers guarantee the file handles are released even when
        # parsing fails part-way through.
        with open(self.t2id_path, encoding="utf8") as handle:
            self.t2id = json.load(handle)
        with open(self.id2h_path, encoding="utf8") as handle:
            self.id2h = json.load(handle)
        with open(self.stopwords_path, encoding="utf8") as handle:
            self.stopwords = {line.strip() for line in handle}
        # Indexing the archive reads the array into memory, so the archive
        # itself can be closed as soon as the array has been extracted.
        with np.load(self.comparison_matrix_path) as archive:
            self.comparisons = archive['arr_0']

        self.model = tf.saved_model.load(self.model_dir)
        # Built once here instead of on every call.
        self._normalizer = Normalizer()

    def __call__(self, inputs: str) -> list:
        """Return the closest labels for ``inputs`` with softmax scores.

        Args:
            inputs: Raw input sentence.

        Returns:
            A list containing a single list of ``{'label': ..., 'score': ...}``
            dicts, ordered from nearest to farthest. At most ``TOP_K`` entries
            are returned (fewer if the comparison matrix has fewer rows).
            Scores are a softmax over the returned candidates only, rounded
            to three decimals.

        Raises:
            KeyError: If ``'UNK'`` is missing from the vocabulary or a
                selected row index is missing from ``id2h``.
        """
        # Preprocess the input sentence
        sentence = self._normalizer.normalize(inputs)
        tokens = [
            token for token in word_tokenize(sentence)
            if token not in self.stopwords
        ]

        # Encode tokens as ids; positions past the last token stay 0.
        unk_id = self.t2id['UNK']
        input_ids = np.zeros((1, self.MAX_TOKENS), dtype=np.int32)
        for position, token in enumerate(tokens[:self.MAX_TOKENS]):
            input_ids[0, position] = self.t2id.get(token, unk_id)

        # Call the model on the input ids
        embeddings = self.model(tf.constant(input_ids, dtype=tf.int32)).numpy()

        # Flatten to a single query row whose width matches the comparison
        # matrix, so a dimension mismatch fails here with a clear error.
        query = embeddings.reshape((1, self.comparisons.shape[1]))

        # Cosine *distance*: smaller means more similar, so an ascending
        # argsort puts the best matches first.
        distances = distance.cdist(query, self.comparisons, "cosine")[0]
        top_indices = distances.argsort()[:self.TOP_K]

        # id2h is keyed by the decimal string of the row index; go through
        # int() so the key does not depend on NumPy scalar formatting.
        top_words = [self.id2h[str(int(index))] for index in top_indices]

        # Negate so that smaller distances yield larger logits.
        logits = -self.SCORE_SCALE * distances[top_indices]
        softmax_probs = tf.nn.softmax(logits).numpy()
        top_scores = [round(float(prob), 3) for prob in softmax_probs]

        return [
            [
                {'label': word, 'score': score}
                for word, score in zip(top_words, top_scores)
            ]
        ]
|