File size: 4,131 Bytes
d47d862
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import sys
import teradataml as tdml
from tabulate import tabulate

import json


# Deployment smoke test for ONNX embedding models on Teradata.
#
# Flow of the script:
#   1. Read conversion_config.json from the current working directory.
#   2. Connect to Teradata using <host> <username> <password> from the
#      command line.
#   3. Store tokenizer.json in 'tokenizer_table' via save_byom.
#   4. For every precision -> file entry in the config, store the model in
#      'model_table', run mldb.ONNXEmbeddings over emails.emails into the
#      volatile table 'emails_embeddings_store', and print the top-3 cosine
#      matches for the row with id = 3 against all other rows.
#   5. Always release the Teradata context, even when a step fails.


def _drop_table_if_exists(table_name, label):
    """Best-effort drop of *table_name*; a failure is reported, not raised.

    Args:
        table_name: Name of the table passed to ``tdml.db_drop_table``.
        label: Human-readable table label used in the console message.
    """
    try:
        tdml.db_drop_table(table_name)
    except Exception as err:
        # Deliberately non-fatal: on a first run there is nothing to drop.
        # ``Exception`` (rather than a bare ``except``) lets Ctrl-C and
        # SystemExit propagate; the underlying error is shown so that a
        # failure other than "table missing" is not silently misreported.
        print(f"Can't drop {label} table - it's not existing ({err})")


# Keep the config file open only for as long as it takes to parse it.
with open('conversion_config.json', encoding='utf-8') as json_file:
    conversion_config = json.load(json_file)

# NOTE(review): model_id is not used further down in this script; the lookup
# is kept so that a config without the key still fails early.
model_id = conversion_config["model_id"]
# int() guards the arithmetic and the SQL interpolation below against a
# config that stores the dimension as a string or float.
number_of_generated_embeddings = int(conversion_config["number_of_generated_embeddings"])
precision_to_filename_map = conversion_config["precision_to_filename_map"]

# Name of the last embedding column is emb_<N-1>; loop-invariant, so computed once.
last_embedding_index = number_of_generated_embeddings - 1

if len(sys.argv) < 4:
    sys.exit(f"Usage: {sys.argv[0]} <host> <username> <password>")
host, username, password = sys.argv[1:4]

print("Setting up connection to teradata...")
tdml.create_context(host = host, username = username, password = password)
try:
    print("Done\n\n")


    print("Deploying tokenizer...")
    _drop_table_if_exists('tokenizer_table', 'tokenizers')
    tdml.save_byom('tokenizer',
                  'tokenizer.json',
                  'tokenizer_table')
    print("Done\n\n")

    print("Testing models...")
    # Dropped once up front; every precision variant is then saved into the
    # same table under its own model id.
    _drop_table_if_exists('model_table', 'models')

    for precision, file_name in precision_to_filename_map.items():
        print(f"Deploying {precision} model...")
        tdml.save_byom(precision,
                      file_name,
                      'model_table')
        print(f"Model {precision} is deployed\n")

        print(f"Calculating embeddings with {precision} model...")
        # The embeddings table is rebuilt from scratch for each precision.
        _drop_table_if_exists('emails_embeddings_store', 'embeddings')

        tdml.execute_sql(f"""
            create volatile table emails_embeddings_store as (
                select 
                    *
            from mldb.ONNXEmbeddings(
                    on emails.emails as InputTable
                    on (select * from model_table where model_id = '{precision}') as ModelTable DIMENSION
                    on (select model as tokenizer from tokenizer_table where model_id = 'tokenizer') as TokenizerTable DIMENSION
               
                    using
                        Accumulate('id', 'txt') 
                        ModelOutputTensor('sentence_embedding')
                        EnableMemoryCheck('false')
                        OutputFormat('FLOAT32({number_of_generated_embeddings})')
                        OverwriteCachedModel('true')
                ) a 
        ) with data on commit preserve rows
        
        """)
        print("Embeddings calculated")
        print(f"Testing semantic search with cosine similiarity on the output of the model with precision '{precision}'...")
        tdf_embeddings_store = tdml.DataFrame('emails_embeddings_store')
        # Row id 3 is the query; every other row is a candidate match.
        tdf_embeddings_store_tgt = tdf_embeddings_store[tdf_embeddings_store.id == 3]

        tdf_embeddings_store_ref = tdf_embeddings_store[tdf_embeddings_store.id != 3]

        # The query converts the returned cosine distance to a similarity
        # (1 - distance) and joins the ids back to emails.emails for the text.
        cos_sim_pd = tdml.DataFrame.from_query(f"""
            SELECT 
                dt.target_id, 
                dt.reference_id,
                e_tgt.txt as target_txt,
                e_ref.txt as reference_txt,
                (1.0 - dt.distance) as similiarity 
            FROM
                TD_VECTORDISTANCE (
                    ON ({tdf_embeddings_store_tgt.show_query()}) AS TargetTable
                    ON ({tdf_embeddings_store_ref.show_query()}) AS ReferenceTable DIMENSION
                    USING
                        TargetIDColumn('id')
                        TargetFeatureColumns('[emb_0:emb_{last_embedding_index}]')
                        RefIDColumn('id')
                        RefFeatureColumns('[emb_0:emb_{last_embedding_index}]')
                        DistanceMeasure('cosine')
                        topk(3)
                ) AS dt
            JOIN emails.emails e_tgt on e_tgt.id = dt.target_id
            JOIN emails.emails e_ref on e_ref.id = dt.reference_id;
            """).to_pandas()
        print(tabulate(cos_sim_pd, headers='keys', tablefmt='fancy_grid'))
        print("Done\n\n")
finally:
    # Release the connection even if deployment or a query above failed.
    tdml.remove_context()