from transformers import AlbertTokenizer, AlbertModel
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

# This is a quick evaluation on a few cases.

# Tokenizer/model variants tried: base / large
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
#model = AlbertModel.from_pretrained("albert-base-v2")
#model_name = 'sentence-transformers/paraphrase-albert-base-v2'
# Earlier fine-tuned checkpoints kept for reference; only the last assignment takes effect.
#model_name = 'output/training_OnlineConstrativeLoss-2023-03-10_11-17-15'
#model_name = 'output/training_OnlineConstrativeLoss-2023-03-11_00-24-35'
#model_name = 'output/training_OnlineConstrativeLoss-2023-03-11_01-00-19'
#model_name = 'output/training_OnlineConstrativeLoss-2023-03-17_16-10-39'
model_name = 'output/training_OnlineConstrativeLoss-2023-03-17_23-15-52'
model_sbert = SentenceTransformer(model_name)


def get_sbert_embedding(input_text):
    # Encode a single address string with the fine-tuned SBERT model and return a plain list.
    embedding = model_sbert.encode(input_text)
    return embedding.tolist()
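
# Hedged sketch, not used below: encode() also accepts a list of strings, so all of the
# test addresses could be embedded in one batched call; the helper name is illustrative.
def get_sbert_embeddings(input_texts):
    # Returns one embedding (as a plain list) per input string.
    embeddings = model_sbert.encode(input_texts)
    return [embedding.tolist() for embedding in embeddings]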

a1 = "65 MOUNTAIN BLVD EXT, WARREN, NJ 07059"
a2 = "112 MOUNTAIN BLVD EXT, WARREN, NJ 07059"
a3 = "1677 NJ-27 #2, EDISON, NJ 08817"
a4 = "5078 S MARYLAND PKWY, LAS VEGAS, NV 89119"
a5 = "65 MOUNTAIN BOULEVARD EXT, WARREN, NJ 07059"
a6 = "123 BROAD ST, NEW YORK, NY, 10304-2345"
a7 = "440 TECHNOLOGY CENTER DRIVE, BOSTON, MA 10034"
a8 = "200 TECHNOLOGY CENTER DRIVE, BOSTON, MA 10034"
a8x= "87 TECHNOLOGY CENTER DRIVE, BOSTON, MA 10034"
#a9 = "440 TECHNOLOGY CENTER DR., BOSTON, MA 10034"
a10= "440 TECHNOLOGY CENTER DR., BOSTON, MA 10034"
a11="87-22 ROUTE 13, CORTLANDVILLE, NY 13045"
a12="87 22 ROUTE 13, CORTLANDVILLE, NY 13045"
a13="87-55 ROUTE 13, CORTLANDVILLE, NY 13045"
a14="257 37 US ROUTE 11, EVANS MILLS, NY 13637"
a15="257-37 US ROUTE 11, EVANS MILLS, NY 13637"

a16="15645 S MAIN ST SUITE D, PENNINGTON, NJ 08534"
a17="156-45 S MAIN ST SUITE D, PENNINGTON, NJ 08534"
a18="156-46 S MAIN ST SUITE D, PENNINGTON, NJ 08534"

a19 = "THE PAVILION AT QUEENS FOR REHABILITAION AND NURSING 36-17 PARSONS BOULEVARD, FLUSHING, NY 11354"
a20 = "136-17 39TH AVENUE, 4TH FLOOR, SUITE CF-E, FLUSHING, NY 11354"
a21="WISDOM MEDICAL P.C., 136-20 38 TH AVE  6E, FLUSHING, NY 11354"

# Inspect how the ALBERT tokenizer splits one of the address strings.
encoded_input = tokenizer(a21, return_tensors='pt')
input_ids = encoded_input.input_ids
input_num_tokens = input_ids.shape[1]
print("Number of input tokens: " + str(input_num_tokens))
list_of_tokens = tokenizer.convert_ids_to_tokens(input_ids.view(-1).tolist())
print("Tokens : " + ' '.join(list_of_tokens))
# Baseline kept for reference: mean-pooled token embeddings from the raw ALBERT model
# (requires torch and the commented-out AlbertModel above).
#def get_embedding(input_text):
#    encoded_input = tokenizer(input_text, return_tensors='pt')
#    input_ids = encoded_input.input_ids
#    input_num_tokens = input_ids.shape[1]
#
#    print( "Number of input tokens: " + str(input_num_tokens))
#    print("Length of input: " + str(len(input_text)))
#
#    list_of_tokens = tokenizer.convert_ids_to_tokens(input_ids.view(-1).tolist())
#
#    print( "Tokens : " + ' '.join(list_of_tokens))
#    with torch.no_grad():
#
#        outputs = model(**encoded_input)
#        last_hidden_states = outputs[0]
#        sentence_embedding = torch.mean(last_hidden_states[0], dim=0)
#        #sentence_embedding = output.last_hidden_state[0][0]
#        return sentence_embedding.tolist()

# Embed each test address with the fine-tuned SBERT model.
e1 = get_sbert_embedding(a1)
e2 = get_sbert_embedding(a2)
#e3 = get_sbert_embedding(a3)
e4 = get_sbert_embedding(a4)
e5 = get_sbert_embedding(a5)
e6 = get_sbert_embedding(a6)
e7 = get_sbert_embedding(a7)
e8 = get_sbert_embedding(a8)
e8x = get_sbert_embedding(a8x)
#e9 = get_sbert_embedding(a9)
e10 = get_sbert_embedding(a10)
e11 = get_sbert_embedding(a11)
e12 = get_sbert_embedding(a12)
e13 = get_sbert_embedding(a13)
e14 = get_sbert_embedding(a14)
e15 = get_sbert_embedding(a15)

e16 = get_sbert_embedding(a16)
e17 = get_sbert_embedding(a17)
e18 = get_sbert_embedding(a18)
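
# Hedged sketch, not used below: the repeated print/cosine pattern could be wrapped in a
# small helper; the name and signature are illustrative only.
# e.g. report_similarity(f"a1 \"{a1}\" to \"{a2}\" a2", e1, e2, "Different")
def report_similarity(pair_label, emb_a, emb_b, expected):
    print(f"{pair_label} - expected {expected}")
    print(cosine_similarity([emb_a], [emb_b]))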

print(f"a1 \"{a1}\" to \"{a2}\" a2 - expected Different")
print(cosine_similarity([e1], [e2]))
print(f"a1 \"{a1}\" to \"{a4}\" a4 - expected Different")
print(cosine_similarity([e1], [e4]))
print(f"a1 \"{a1}\" to \"{a5}\" a5 - expected Same")
print(cosine_similarity([e1], [e5]))

print(f"a7 \"{a7}\" to \"{a8}\" a8 - expected Different")
print(cosine_similarity([e7], [e8]))
print(f"a7 \"{a7}\" to \"{a8x}\" a8x - expected Different")
print(cosine_similarity([e7], [e8x]))

#print(f"a7 \"{a7}\" to \"{a9}\" a9 - expected Same")
#print(cosine_similarity([e7], [e9]))

print(f"a7 \"{a7}\" to \"{a10}\" a10 - expected Same")
print(cosine_similarity([e7], [e10]))

print(f"a11 \"{a11}\" to \"{a12}\" a12 - expected Same")
print(cosine_similarity([e11], [e12]))

print(f"a11 \"{a11}\" to \"{a13}\" a13 - expected Different")
print(cosine_similarity([e11], [e13]))

print(f"a14 \"{a14}\" to \"{a15}\" a15 - expected Same")
print(cosine_similarity([e14], [e15]))

print(f"a16 \"{a16}\" to \"{a17}\" a17 - expected Same")
print(cosine_similarity([e16], [e17]))

print(f"a16 \"{a16}\" to \"{a18}\" a18 - expected Different")
print(cosine_similarity([e16], [e18]))
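
# Alternative sketch (assumption, not part of the original flow): sentence_transformers
# ships its own cosine-similarity helper, which yields the same numbers without sklearn.
from sentence_transformers import util
print(f"a16 \"{a16}\" to \"{a18}\" a18 via util.cos_sim - expected Different")
print(util.cos_sim(e16, e18))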

# Recorded similarities from earlier runs:
# with base
#a1 to a2
#[[0.99512167]]
#a1 to a4
#[[0.94850088]]
#a1 to a5
#[[0.99636901]]

# with large
#a1 to a2
#[[0.99682108]]
#a1 to a4
#[[0.94006972]]
#a1 to a5
#[[0.99503919]]