martinhillebrandtd committed on
Commit
150045e
·
1 Parent(s): 2613c42
.gitattributes CHANGED
@@ -7,6 +7,7 @@
7
  *.gz filter=lfs diff=lfs merge=lfs -text
8
  *.h5 filter=lfs diff=lfs merge=lfs -text
9
  *.joblib filter=lfs diff=lfs merge=lfs -text
 
10
  *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
  *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
  *.model filter=lfs diff=lfs merge=lfs -text
 
7
  *.gz filter=lfs diff=lfs merge=lfs -text
8
  *.h5 filter=lfs diff=lfs merge=lfs -text
9
  *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.json filter=lfs diff=lfs merge=lfs -text
11
  *.lfs.* filter=lfs diff=lfs merge=lfs -text
12
  *.mlmodel filter=lfs diff=lfs merge=lfs -text
13
  *.model filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
The diff for this file is too large to render. See raw diff
 
config.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bb9341fdaa9b357bfdd20c0346405f65eb61eaba6e9936983248e4a6db20c649
3
+ size 792
conversion_config.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f11e34adf023f7e7d45302679b02f8fcff767d7890ee34c6cc90c649354bf6f5
3
+ size 284
convert.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Export the configured model to ONNX and write the configured precision variants.

Reads ``conversion_config.json`` from the working directory, exports the model
named there to ``./model.onnx``, then produces one output file per entry of
``precision_to_filename_map`` (a plain copy for ``fp32``, dynamically quantized
files for ``int8`` and ``uint8``). The intermediate ``model.onnx`` is removed
at the end, whether or not the variant steps succeeded.
"""
import os
import json
import shutil

from optimum.exporters.onnx import main_export
import onnx
from onnxconverter_common import float16
import onnxruntime as rt
from onnxruntime.tools.onnx_model_utils import *
from onnxruntime.quantization import quantize_dynamic, QuantType

with open('conversion_config.json') as json_file:
    conversion_config = json.load(json_file)


model_id = conversion_config["model_id"]
number_of_generated_embeddings = conversion_config["number_of_generated_embeddings"]
precision_to_filename_map = conversion_config["precision_to_filename_map"]
opset = conversion_config["opset"]
IR = conversion_config["IR"]


# NOTE(review): `op` and `IR` are not used by anything below in this script;
# kept so the config keys are still required - confirm whether they can go.
op = onnx.OperatorSetIdProto()
op.version = opset


# exist_ok avoids the check-then-create race of a separate os.path.exists test.
os.makedirs("onnx", exist_ok=True)

# Output paths come from the config, so make sure every target directory
# exists instead of relying on the hard-coded "onnx" folder alone.
for target_path in precision_to_filename_map.values():
    target_dir = os.path.dirname(target_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

print("Exporting the main model version")

main_export(model_name_or_path=model_id, output="./", opset=opset, trust_remote_code=True, task="feature-extraction", dtype="fp32")

try:
    if "fp32" in precision_to_filename_map:
        print("Exporting the fp32 onnx file...")

        shutil.copyfile('model.onnx', precision_to_filename_map["fp32"])

        print("Done\n\n")

    if "int8" in precision_to_filename_map:
        print("Quantizing fp32 model to int8...")
        quantize_dynamic("model.onnx", precision_to_filename_map["int8"], weight_type=QuantType.QInt8)
        print("Done\n\n")

    if "uint8" in precision_to_filename_map:
        print("Quantizing fp32 model to uint8...")
        quantize_dynamic("model.onnx", precision_to_filename_map["uint8"], weight_type=QuantType.QUInt8)
        print("Done\n\n")
finally:
    # Always drop the intermediate export so a failed step does not leave the
    # full-size model behind in the working directory.
    os.remove("model.onnx")
onnx/model.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:865e3ab074ae342f1ddfc94f88faab304a7a18e12bd126d25a570291aad5d187
3
+ size 1110158366
onnx/model_int8.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:69c054838d2d2b368e2fd823a505f6647019f791760afac78ee08e373ecba2a5
3
+ size 278400127
onnx/model_uint8.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b85476697945c3cf9595016472590002934afa65c87698602858b6fd93d7b795
3
+ size 278400158
sentencepiece.bpe.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865
3
+ size 5069051
special_tokens_map.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8c785abebea9ae3257b61681b4e6fd8365ceafde980c21970d001e834cf10835
3
+ size 964
test_local.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Compare cosine similarity of two sentences across the ONNX files and the original model.

Reads ``conversion_config.json`` from the working directory, runs both test
sentences through every ONNX file listed in ``precision_to_filename_map``, and
prints the cosine similarity for each, followed by the same figure computed
with the original SentenceTransformer model.
"""
import onnxruntime as rt

from sentence_transformers.util import cos_sim
from sentence_transformers import SentenceTransformer

import transformers

import gc
import json


with open('conversion_config.json') as json_file:
    conversion_config = json.load(json_file)


model_id = conversion_config["model_id"]
number_of_generated_embeddings = conversion_config["number_of_generated_embeddings"]
precision_to_filename_map = conversion_config["precision_to_filename_map"]

sentences_1 = 'How is the weather today?'
sentences_2 = 'What is the current weather like today?'

print(f"Testing on cosine similiarity between sentences: \n'{sentences_1}'\n'{sentences_2}'\n\n\n")

tokenizer = transformers.AutoTokenizer.from_pretrained("./")
enc1 = tokenizer(sentences_1)
enc2 = tokenizer(sentences_2)


def _onnx_sentence_embedding(session, encoding):
    """Run one tokenized sentence through ``session`` and return its embedding.

    Each input is wrapped in a list so the model receives a batch of one.
    The second model output is used, first row.
    """
    outputs = session.run(None, {"input_ids": [encoding.input_ids],
                                 "attention_mask": [encoding.attention_mask]})
    # NOTE(review): index 1 is presumably the pooled sentence embedding
    # output - confirm against the exported model's output list.
    return outputs[1][0]


for precision, file_name in precision_to_filename_map.items():
    onnx_session = rt.InferenceSession(file_name)
    embeddings_1_onnx = _onnx_sentence_embedding(onnx_session, enc1)
    embeddings_2_onnx = _onnx_sentence_embedding(onnx_session, enc2)

    # Release the session before the next model file is loaded.
    del onnx_session
    gc.collect()
    print(f'Cosine similiarity for ONNX model with precision "{precision}" is {str(cos_sim(embeddings_1_onnx, embeddings_2_onnx))}')


model = SentenceTransformer(model_id, trust_remote_code=True)
# trust_remote_code belongs to model loading (above) only; encode() does not
# define it, so it is no longer passed here.
embeddings_1_sentence_transformer = model.encode(sentences_1, normalize_embeddings=True)
embeddings_2_sentence_transformer = model.encode(sentences_2, normalize_embeddings=True)
print('Cosine similiarity for original sentence transformer model is '+str(cos_sim(embeddings_1_sentence_transformer, embeddings_2_sentence_transformer)))
test_teradata.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Deploy the tokenizer and the ONNX models to Teradata and run a similarity check.

Usage: python test_teradata.py <host> <username> <password>

Reads ``conversion_config.json`` from the working directory. For every entry
of ``precision_to_filename_map`` the model file is saved to ``model_table``,
embeddings are computed for ``emails.emails`` with ``mldb.ONNXEmbeddings``,
and the rows closest to the row with id 3 are printed as a table.
"""
import sys
import teradataml as tdml
from tabulate import tabulate

import json


with open('conversion_config.json') as json_file:
    conversion_config = json.load(json_file)


model_id = conversion_config["model_id"]
number_of_generated_embeddings = conversion_config["number_of_generated_embeddings"]
precision_to_filename_map = conversion_config["precision_to_filename_map"]

# Fail with a usage hint instead of a bare IndexError when arguments are missing.
if len(sys.argv) < 4:
    sys.exit(f"Usage: {sys.argv[0]} <host> <username> <password>")

# NOTE(review): the password is taken from the command line, where it can end
# up in shell history and process listings - consider another input channel.
host = sys.argv[1]
username = sys.argv[2]
password = sys.argv[3]


def _drop_table_if_exists(table_name, label):
    """Drop ``table_name``; on failure print the notice for ``label`` and continue."""
    try:
        tdml.db_drop_table(table_name)
    except Exception:
        # Best effort: a failed drop is reported, not fatal. Catching
        # Exception (not everything) keeps Ctrl-C and sys.exit working.
        print(f"Can't drop {label} table - it's not existing")


print("Setting up connection to teradata...")
tdml.create_context(host = host, username = username, password = password)
print("Done\n\n")

try:
    print("Deploying tokenizer...")
    _drop_table_if_exists('tokenizer_table', 'tokenizers')
    tdml.save_byom('tokenizer',
                   'tokenizer.json',
                   'tokenizer_table')
    print("Done\n\n")

    print("Testing models...")
    _drop_table_if_exists('model_table', 'models')

    for precision, file_name in precision_to_filename_map.items():
        print(f"Deploying {precision} model...")
        tdml.save_byom(precision,
                       file_name,
                       'model_table')
        print(f"Model {precision} is deployed\n")

        print(f"Calculating embeddings with {precision} model...")
        _drop_table_if_exists('emails_embeddings_store', 'embeddings')

        tdml.execute_sql(f"""
            create volatile table emails_embeddings_store as (
                select
                        *
                from mldb.ONNXEmbeddings(
                        on emails.emails as InputTable
                        on (select * from model_table where model_id = '{precision}') as ModelTable DIMENSION
                        on (select model as tokenizer from tokenizer_table where model_id = 'tokenizer') as TokenizerTable DIMENSION

                        using
                            Accumulate('id', 'txt')
                            ModelOutputTensor('sentence_embedding')
                            EnableMemoryCheck('false')
                            OutputFormat('FLOAT32({number_of_generated_embeddings})')
                            OverwriteCachedModel('true')
                    ) a
            ) with data on commit preserve rows

        """)
        print("Embeddings calculated")
        print(f"Testing semantic search with cosine similiarity on the output of the model with precision '{precision}'...")
        tdf_embeddings_store = tdml.DataFrame('emails_embeddings_store')
        # Row 3 is the search target; every other row is a candidate match.
        tdf_embeddings_store_tgt = tdf_embeddings_store[tdf_embeddings_store.id == 3]

        tdf_embeddings_store_ref = tdf_embeddings_store[tdf_embeddings_store.id != 3]

        cos_sim_pd = tdml.DataFrame.from_query(f"""
        SELECT
            dt.target_id,
            dt.reference_id,
            e_tgt.txt as target_txt,
            e_ref.txt as reference_txt,
            (1.0 - dt.distance) as similiarity
        FROM
            TD_VECTORDISTANCE (
                ON ({tdf_embeddings_store_tgt.show_query()}) AS TargetTable
                ON ({tdf_embeddings_store_ref.show_query()}) AS ReferenceTable DIMENSION
                USING
                    TargetIDColumn('id')
                    TargetFeatureColumns('[emb_0:emb_{number_of_generated_embeddings - 1}]')
                    RefIDColumn('id')
                    RefFeatureColumns('[emb_0:emb_{number_of_generated_embeddings - 1}]')
                    DistanceMeasure('cosine')
                    topk(3)
            ) AS dt
        JOIN emails.emails e_tgt on e_tgt.id = dt.target_id
        JOIN emails.emails e_ref on e_ref.id = dt.reference_id;
        """).to_pandas()
        print(tabulate(cos_sim_pd, headers='keys', tablefmt='fancy_grid'))
        print("Done\n\n")
finally:
    # Close the connection even when a deployment or query step fails.
    tdml.remove_context()
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3a56def25aa40facc030ea8b0b87f3688e4b3c39eb8b45d5702b3a1300fe2a20
3
+ size 17082734
tokenizer_config.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dfb4021b23969fc942e96b2c4ac906d94712b4b96ef7081cb66a0ea211896d51
3
+ size 1177