bge-small-en-v1.5 / test_teradata.py
martinhillebrandtd's picture
model bge small en v 1.5
d47d862
raw
history blame
4.13 kB
import sys
import teradataml as tdml
from tabulate import tabulate
import json
# Load the conversion settings shared by the rest of this script.
# JSON is UTF-8 by specification, so decode it explicitly instead of relying
# on the platform's locale-dependent default encoding.
with open('conversion_config.json', encoding='utf-8') as json_file:
    conversion_config = json.load(json_file)

# Values read from the config and used further down in the script.
model_id = conversion_config["model_id"]
# Used as the FLOAT32(...) output width and to build the emb_0..emb_{N-1}
# feature-column range.
number_of_generated_embeddings = conversion_config["number_of_generated_embeddings"]
# Mapping iterated as (precision, file_name) pairs when deploying models.
precision_to_filename_map = conversion_config["precision_to_filename_map"]
# Connection parameters are positional command-line arguments:
#   <script> <host> <username> <password>
# Fail with a usage message instead of a bare IndexError when one is missing.
# Any extra trailing arguments are ignored, as before.
if len(sys.argv) < 4:
    sys.exit(f"Usage: {sys.argv[0]} <host> <username> <password>")
# NOTE(review): a password passed via argv is visible in the process list and
# shell history; consider getpass or an environment variable.
host, username, password = sys.argv[1:4]

print("Setting up connection to teradata...")
tdml.create_context(host=host, username=username, password=password)
print("Done\n\n")
print("Deploying tokenizer...")
# Try to drop the tokenizer table before saving into it; a failed drop is
# reported and otherwise ignored.
try:
    tdml.db_drop_table('tokenizer_table')
except Exception:
    # `Exception` rather than a bare `except:` so that Ctrl-C and SystemExit
    # are not swallowed.
    print("Can't drop tokenizers table - it's not existing")
# Store tokenizer.json under the id 'tokenizer' in tokenizer_table.
tdml.save_byom('tokenizer', 'tokenizer.json', 'tokenizer_table')
print("Done\n\n")
print("Testing models...")
# Everything below needs the live connection. The `finally` guarantees the
# context is removed even when a deployment or query fails part-way through.
try:
    # Try to drop the shared model table; a failed drop is reported and
    # otherwise ignored.
    try:
        tdml.db_drop_table('model_table')
    except Exception:
        # `Exception` rather than a bare `except:` so that Ctrl-C and
        # SystemExit are not swallowed.
        print("Can't drop models table - it's not existing")

    # Loop-invariant feature-column range for TD_VECTORDISTANCE:
    # emb_0 .. emb_{N-1}.
    embedding_columns = f"[emb_0:emb_{number_of_generated_embeddings - 1}]"

    for precision, file_name in precision_to_filename_map.items():
        print(f"Deploying {precision} model...")
        # The precision name is also the model_id that the embeddings query
        # below filters on.
        tdml.save_byom(precision, file_name, 'model_table')
        print(f"Model {precision} is deployed\n")

        print(f"Calculating embeddings with {precision} model...")
        # The embeddings store is re-created for every precision, so try to
        # drop it first.
        try:
            tdml.db_drop_table('emails_embeddings_store')
        except Exception:
            print("Can't drop embeddings table - it's not existing")

        # Run ONNXEmbeddings over emails.emails with the model row for this
        # precision and the deployed tokenizer, storing the result in a
        # volatile table.
        tdml.execute_sql(f"""
            create volatile table emails_embeddings_store as (
                select
                    *
                from mldb.ONNXEmbeddings(
                    on emails.emails as InputTable
                    on (select * from model_table where model_id = '{precision}') as ModelTable DIMENSION
                    on (select model as tokenizer from tokenizer_table where model_id = 'tokenizer') as TokenizerTable DIMENSION
                    using
                        Accumulate('id', 'txt')
                        ModelOutputTensor('sentence_embedding')
                        EnableMemoryCheck('false')
                        OutputFormat('FLOAT32({number_of_generated_embeddings})')
                        OverwriteCachedModel('true')
                ) a
            ) with data on commit preserve rows
        """)
        print("Embeddings calculated")

        print(f"Testing semantic search with cosine similiarity on the output of the model with precision '{precision}'...")
        tdf_embeddings_store = tdml.DataFrame('emails_embeddings_store')
        # The row with id 3 is the search target; every other row is a
        # reference candidate.
        tdf_embeddings_store_tgt = tdf_embeddings_store[tdf_embeddings_store.id == 3]
        tdf_embeddings_store_ref = tdf_embeddings_store[tdf_embeddings_store.id != 3]

        # Top-3 references by cosine distance, reported as 1 - distance and
        # joined back to the source texts.
        cos_sim_pd = tdml.DataFrame.from_query(f"""
            SELECT
                dt.target_id,
                dt.reference_id,
                e_tgt.txt as target_txt,
                e_ref.txt as reference_txt,
                (1.0 - dt.distance) as similiarity
            FROM
                TD_VECTORDISTANCE (
                    ON ({tdf_embeddings_store_tgt.show_query()}) AS TargetTable
                    ON ({tdf_embeddings_store_ref.show_query()}) AS ReferenceTable DIMENSION
                    USING
                        TargetIDColumn('id')
                        TargetFeatureColumns('{embedding_columns}')
                        RefIDColumn('id')
                        RefFeatureColumns('{embedding_columns}')
                        DistanceMeasure('cosine')
                        topk(3)
                ) AS dt
            JOIN emails.emails e_tgt on e_tgt.id = dt.target_id
            JOIN emails.emails e_ref on e_ref.id = dt.reference_id;
        """).to_pandas()

        print(tabulate(cos_sim_pd, headers='keys', tablefmt='fancy_grid'))
        # NOTE(review): the source's indentation was lost; this line is
        # assumed to be the per-model terminator inside the loop - confirm.
        print("Done\n\n")
finally:
    # Always release the connection context, on success or failure.
    tdml.remove_context()