Commit
·
150045e
1
Parent(s):
2613c42
model
Browse files- .gitattributes +1 -0
- README.md +0 -0
- config.json +3 -0
- conversion_config.json +3 -0
- convert.py +51 -0
- onnx/model.onnx +3 -0
- onnx/model_int8.onnx +3 -0
- onnx/model_uint8.onnx +3 -0
- sentencepiece.bpe.model +3 -0
- special_tokens_map.json +3 -0
- test_local.py +49 -0
- test_teradata.py +106 -0
- tokenizer.json +3 -0
- tokenizer_config.json +3 -0
.gitattributes
CHANGED
@@ -7,6 +7,7 @@
|
|
7 |
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
*.joblib filter=lfs diff=lfs merge=lfs -text
|
|
|
10 |
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
*.model filter=lfs diff=lfs merge=lfs -text
|
|
|
7 |
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
+
*.json filter=lfs diff=lfs merge=lfs -text
|
11 |
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
12 |
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
13 |
*.model filter=lfs diff=lfs merge=lfs -text
|
README.md
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
config.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bb9341fdaa9b357bfdd20c0346405f65eb61eaba6e9936983248e4a6db20c649
|
3 |
+
size 792
|
conversion_config.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f11e34adf023f7e7d45302679b02f8fcff767d7890ee34c6cc90c649354bf6f5
|
3 |
+
size 284
|
convert.py
ADDED
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Export a Hugging Face model to ONNX and produce quantized variants.

Reads ./conversion_config.json for the model id, ONNX opset and a
precision -> output-filename mapping, exports an fp32 feature-extraction
model with optimum, then writes fp32 / int8 / uint8 copies as requested.
"""
import os
import json
import shutil

from optimum.exporters.onnx import main_export
import onnx
from onnxconverter_common import float16
import onnxruntime as rt
from onnxruntime.tools.onnx_model_utils import *
from onnxruntime.quantization import quantize_dynamic, QuantType

with open('conversion_config.json') as json_file:
    conversion_config = json.load(json_file)


model_id = conversion_config["model_id"]
number_of_generated_embeddings = conversion_config["number_of_generated_embeddings"]
precision_to_filename_map = conversion_config["precision_to_filename_map"]
opset = conversion_config["opset"]
IR = conversion_config["IR"]


# NOTE(review): op and IR are prepared but never applied to the exported
# model below — presumably intended for an opset/IR-version rewrite step;
# confirm before removing.
op = onnx.OperatorSetIdProto()
op.version = opset


# exist_ok avoids the check-then-create race of os.path.exists + makedirs
os.makedirs("onnx", exist_ok=True)

print("Exporting the main model version")

# The fp32 export lands in ./model.onnx; all variants are derived from it.
main_export(model_name_or_path=model_id, output="./", opset=opset, trust_remote_code=True, task="feature-extraction", dtype="fp32")

if "fp32" in precision_to_filename_map:
    print("Exporting the fp32 onnx file...")

    shutil.copyfile('model.onnx', precision_to_filename_map["fp32"])

    print("Done\n\n")

if "int8" in precision_to_filename_map:
    print("Quantizing fp32 model to int8...")
    quantize_dynamic("model.onnx", precision_to_filename_map["int8"], weight_type=QuantType.QInt8)
    print("Done\n\n")

if "uint8" in precision_to_filename_map:
    print("Quantizing fp32 model to uint8...")
    quantize_dynamic("model.onnx", precision_to_filename_map["uint8"], weight_type=QuantType.QUInt8)
    print("Done\n\n")

# The intermediate fp32 export is no longer needed once copies are written.
os.remove("model.onnx")
onnx/model.onnx
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:865e3ab074ae342f1ddfc94f88faab304a7a18e12bd126d25a570291aad5d187
|
3 |
+
size 1110158366
|
onnx/model_int8.onnx
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:69c054838d2d2b368e2fd823a505f6647019f791760afac78ee08e373ecba2a5
|
3 |
+
size 278400127
|
onnx/model_uint8.onnx
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b85476697945c3cf9595016472590002934afa65c87698602858b6fd93d7b795
|
3 |
+
size 278400158
|
sentencepiece.bpe.model
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865
|
3 |
+
size 5069051
|
special_tokens_map.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8c785abebea9ae3257b61681b4e6fd8365ceafde980c21970d001e834cf10835
|
3 |
+
size 964
|
test_local.py
ADDED
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Smoke-test the exported ONNX models against the original model.

For each exported precision, embeds two similar sentences with the ONNX
model and prints their cosine similarity; then prints the same metric
computed by the original sentence-transformers model for comparison.
"""
import onnxruntime as rt

from sentence_transformers.util import cos_sim
from sentence_transformers import SentenceTransformer

import transformers

import gc
import json


with open('conversion_config.json') as json_file:
    conversion_config = json.load(json_file)


model_id = conversion_config["model_id"]
number_of_generated_embeddings = conversion_config["number_of_generated_embeddings"]
precision_to_filename_map = conversion_config["precision_to_filename_map"]

sentences_1 = 'How is the weather today?'
sentences_2 = 'What is the current weather like today?'

print(f"Testing on cosine similiarity between sentences: \n'{sentences_1}'\n'{sentences_2}'\n\n\n")

# Tokenize once with the locally exported tokenizer; reuse per precision.
tokenizer = transformers.AutoTokenizer.from_pretrained("./")
enc1 = tokenizer(sentences_1)
enc2 = tokenizer(sentences_2)


def _onnx_sentence_embedding(session, encoding):
    # Batch of one; output index 1 holds the pooled sentence embedding.
    feed = {"input_ids": [encoding.input_ids],
            "attention_mask": [encoding.attention_mask]}
    return session.run(None, feed)[1][0]


for precision, file_name in precision_to_filename_map.items():
    onnx_session = rt.InferenceSession(file_name)
    embeddings_1_onnx = _onnx_sentence_embedding(onnx_session, enc1)
    embeddings_2_onnx = _onnx_sentence_embedding(onnx_session, enc2)

    # Release the session before loading the next (large) model file.
    del onnx_session
    gc.collect()
    print(f'Cosine similiarity for ONNX model with precision "{precision}" is {str(cos_sim(embeddings_1_onnx, embeddings_2_onnx))}')


# Reference result from the original model.
# NOTE(review): trust_remote_code is passed to encode() as well as the
# constructor — verify the installed sentence-transformers accepts it there.
model = SentenceTransformer(model_id, trust_remote_code=True)
embeddings_1_sentence_transformer = model.encode(sentences_1, normalize_embeddings=True, trust_remote_code=True)
embeddings_2_sentence_transformer = model.encode(sentences_2, normalize_embeddings=True, trust_remote_code=True)
print('Cosine similiarity for original sentence transformer model is '+str(cos_sim(embeddings_1_sentence_transformer, embeddings_2_sentence_transformer)))
test_teradata.py
ADDED
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Deploy the exported ONNX models and tokenizer to Teradata and test them.

Usage: test_teradata.py <host> <username> <password>

Saves the tokenizer and each precision variant via BYOM, computes email
embeddings with mldb.ONNXEmbeddings, and runs a cosine-similarity
semantic search (TD_VECTORDISTANCE) against them.
"""
import sys
import teradataml as tdml
from tabulate import tabulate

import json


with open('conversion_config.json') as json_file:
    conversion_config = json.load(json_file)


model_id = conversion_config["model_id"]
number_of_generated_embeddings = conversion_config["number_of_generated_embeddings"]
precision_to_filename_map = conversion_config["precision_to_filename_map"]

host = sys.argv[1]
username = sys.argv[2]
password = sys.argv[3]

print("Setting up connection to teradata...")
tdml.create_context(host = host, username = username, password = password)
print("Done\n\n")


print("Deploying tokenizer...")
try:
    tdml.db_drop_table('tokenizer_table')
# Narrowed from a bare except: don't swallow KeyboardInterrupt/SystemExit.
except Exception:
    print("Can't drop tokenizers table - it's not existing")
tdml.save_byom('tokenizer',
               'tokenizer.json',
               'tokenizer_table')
print("Done\n\n")

print("Testing models...")
try:
    tdml.db_drop_table('model_table')
except Exception:
    print("Can't drop models table - it's not existing")

for precision, file_name in precision_to_filename_map.items():
    print(f"Deploying {precision} model...")
    tdml.save_byom(precision,
                   file_name,
                   'model_table')
    print(f"Model {precision} is deployed\n")

    print(f"Calculating embeddings with {precision} model...")
    try:
        tdml.db_drop_table('emails_embeddings_store')
    except Exception:
        print("Can't drop embeddings table - it's not existing")

    # Volatile table holding one embedding row per email for this precision.
    tdml.execute_sql(f"""
    create volatile table emails_embeddings_store as (
        select
            *
        from mldb.ONNXEmbeddings(
                on emails.emails as InputTable
                on (select * from model_table where model_id = '{precision}') as ModelTable DIMENSION
                on (select model as tokenizer from tokenizer_table where model_id = 'tokenizer') as TokenizerTable DIMENSION

                using
                    Accumulate('id', 'txt')
                    ModelOutputTensor('sentence_embedding')
                    EnableMemoryCheck('false')
                    OutputFormat('FLOAT32({number_of_generated_embeddings})')
                    OverwriteCachedModel('true')
            ) a
    ) with data on commit preserve rows

    """)
    print("Embeddings calculated")
    print(f"Testing semantic search with cosine similiarity on the output of the model with precision '{precision}'...")
    tdf_embeddings_store = tdml.DataFrame('emails_embeddings_store')
    # Email id 3 is the search target; every other email is a reference.
    tdf_embeddings_store_tgt = tdf_embeddings_store[tdf_embeddings_store.id == 3]

    tdf_embeddings_store_ref = tdf_embeddings_store[tdf_embeddings_store.id != 3]

    cos_sim_pd = tdml.DataFrame.from_query(f"""
    SELECT
        dt.target_id,
        dt.reference_id,
        e_tgt.txt as target_txt,
        e_ref.txt as reference_txt,
        (1.0 - dt.distance) as similiarity
    FROM
        TD_VECTORDISTANCE (
            ON ({tdf_embeddings_store_tgt.show_query()}) AS TargetTable
            ON ({tdf_embeddings_store_ref.show_query()}) AS ReferenceTable DIMENSION
            USING
                TargetIDColumn('id')
                TargetFeatureColumns('[emb_0:emb_{number_of_generated_embeddings - 1}]')
                RefIDColumn('id')
                RefFeatureColumns('[emb_0:emb_{number_of_generated_embeddings - 1}]')
                DistanceMeasure('cosine')
                topk(3)
        ) AS dt
    JOIN emails.emails e_tgt on e_tgt.id = dt.target_id
    JOIN emails.emails e_ref on e_ref.id = dt.reference_id;
    """).to_pandas()
    print(tabulate(cos_sim_pd, headers='keys', tablefmt='fancy_grid'))
    print("Done\n\n")


tdml.remove_context()
tokenizer.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3a56def25aa40facc030ea8b0b87f3688e4b3c39eb8b45d5702b3a1300fe2a20
|
3 |
+
size 17082734
|
tokenizer_config.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:dfb4021b23969fc942e96b2c4ac906d94712b4b96ef7081cb66a0ea211896d51
|
3 |
+
size 1177
|