|
import sys |
|
import teradataml as tdml |
|
from tabulate import tabulate |
|
|
|
import json |
|
|
|
|
|
# Load the conversion configuration that drives the rest of the script:
#   model_id                       - identifier of the converted model
#   number_of_generated_embeddings - width of the embedding vector each model emits
#   precision_to_filename_map      - {precision label -> ONNX model file name}
# Explicit encoding avoids platform-dependent decoding of the JSON file.
with open('conversion_config.json', encoding='utf-8') as json_file:
    conversion_config = json.load(json_file)

model_id = conversion_config["model_id"]
number_of_generated_embeddings = conversion_config["number_of_generated_embeddings"]
precision_to_filename_map = conversion_config["precision_to_filename_map"]
|
|
|
# Connection parameters are positional command-line arguments:
#   <script> <host> <username> <password>
# Guard the access so a bad invocation yields a usage message instead of
# an opaque IndexError.
if len(sys.argv) < 4:
    sys.exit(f"Usage: {sys.argv[0]} <host> <username> <password>")

host = sys.argv[1]
username = sys.argv[2]
password = sys.argv[3]

print("Setting up connection to teradata...")
tdml.create_context(host=host, username=username, password=password)
print("Done\n\n")
|
|
|
|
|
print("Deploying tokenizer...")
# Drop any tokenizer table left over from a previous run; a missing table is
# expected on the first run.  Catch Exception (not a bare except) so that
# KeyboardInterrupt / SystemExit still propagate.
try:
    tdml.db_drop_table('tokenizer_table')
except Exception:
    print("Can't drop tokenizers table - it's not existing")

# Store the tokenizer definition in-database (BYOM) under id 'tokenizer'.
tdml.save_byom('tokenizer',
               'tokenizer.json',
               'tokenizer_table')
print("Done\n\n")
|
|
|
print("Testing models...")
# Drop any model table left over from a previous run; a missing table is
# expected on the first run.  Catch Exception (not a bare except) so that
# KeyboardInterrupt / SystemExit still propagate.
try:
    tdml.db_drop_table('model_table')
except Exception:
    print("Can't drop models table - it's not existing")
|
|
|
# For every precision variant of the model: deploy it in-database, compute
# embeddings for all e-mails with ONNXEmbeddings, then smoke-test the result
# with a cosine-similarity semantic search via TD_VECTORDISTANCE.
for precision, file_name in precision_to_filename_map.items():
    print(f"Deploying {precision} model...")
    tdml.save_byom(precision,
                   file_name,
                   'model_table')
    print(f"Model {precision} is deployed\n")

    print(f"Calculating embeddings with {precision} model...")
    # A leftover volatile table from a previous iteration/run may not exist;
    # catch Exception rather than using a bare except.
    try:
        tdml.db_drop_table('emails_embeddings_store')
    except Exception:
        print("Can't drop embeddings table - it's not existing")

    # NOTE(review): the interpolated values ({precision},
    # {number_of_generated_embeddings}) come from the local config file, not
    # from untrusted user input, so f-string SQL is acceptable here.
    tdml.execute_sql(f"""
    create volatile table emails_embeddings_store as (
        select
            *
        from mldb.ONNXEmbeddings(
                on emails.emails as InputTable
                on (select * from model_table where model_id = '{precision}') as ModelTable DIMENSION
                on (select model as tokenizer from tokenizer_table where model_id = 'tokenizer') as TokenizerTable DIMENSION

                using
                    Accumulate('id', 'txt')
                    ModelOutputTensor('sentence_embedding')
                    EnableMemoryCheck('false')
                    OutputFormat('FLOAT32({number_of_generated_embeddings})')
                    OverwriteCachedModel('true')
            ) a
    ) with data on commit preserve rows

    """)
    print("Embeddings calculated")

    print(f"Testing semantic search with cosine similarity on the output of the model with precision '{precision}'...")
    tdf_embeddings_store = tdml.DataFrame('emails_embeddings_store')
    # Use e-mail id 3 as the search target; all other e-mails are candidates.
    tdf_embeddings_store_tgt = tdf_embeddings_store[tdf_embeddings_store.id == 3]
    tdf_embeddings_store_ref = tdf_embeddings_store[tdf_embeddings_store.id != 3]

    # Top-3 nearest neighbours by cosine distance, joined back to the source
    # e-mails so the printed table shows the actual texts.
    # (1 - distance) converts cosine distance back to similarity.
    cos_sim_pd = tdml.DataFrame.from_query(f"""
    SELECT
        dt.target_id,
        dt.reference_id,
        e_tgt.txt as target_txt,
        e_ref.txt as reference_txt,
        (1.0 - dt.distance) as similarity
    FROM
        TD_VECTORDISTANCE (
            ON ({tdf_embeddings_store_tgt.show_query()}) AS TargetTable
            ON ({tdf_embeddings_store_ref.show_query()}) AS ReferenceTable DIMENSION
            USING
                TargetIDColumn('id')
                TargetFeatureColumns('[emb_0:emb_{number_of_generated_embeddings - 1}]')
                RefIDColumn('id')
                RefFeatureColumns('[emb_0:emb_{number_of_generated_embeddings - 1}]')
                DistanceMeasure('cosine')
                topk(3)
        ) AS dt
    JOIN emails.emails e_tgt on e_tgt.id = dt.target_id
    JOIN emails.emails e_ref on e_ref.id = dt.reference_id;
    """).to_pandas()

    print(tabulate(cos_sim_pd, headers='keys', tablefmt='fancy_grid'))
    print("Done\n\n")
|
|
|
|
|
# Tear down the Teradata connection context opened at the top of the script.
tdml.remove_context()