from transformers import AlbertTokenizer, AlbertModel
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
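# torch is only needed if the commented-out raw-ALBERT embedding path further
# below is re-enabled; the SentenceTransformer path does not use it directly.
#import torch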
# This is a quick evaluation on a few cases.
# "base" / "large" refer to the ALBERT base and large variants; results recorded
# with each are kept as comments at the bottom of the file.
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
#model = AlbertModel.from_pretrained("albert-base-v2")
#'sentence-transformers/paraphrase-albert-base-v2'
# Earlier fine-tuned checkpoints, kept for reference; only the last one is used.
#model_name = 'output/training_OnlineConstrativeLoss-2023-03-10_11-17-15'
#model_name = 'output/training_OnlineConstrativeLoss-2023-03-11_00-24-35'
#model_name = 'output/training_OnlineConstrativeLoss-2023-03-11_01-00-19'
#model_name = 'output/training_OnlineConstrativeLoss-2023-03-17_16-10-39'
model_name = 'output/training_OnlineConstrativeLoss-2023-03-17_23-15-52'
model_sbert = SentenceTransformer(model_name)
def get_sbert_embedding(input_text):
    # Encode a single string with the fine-tuned SentenceTransformer model.
    embedding = model_sbert.encode(input_text)
    return embedding.tolist()
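# Note: model_sbert.encode also accepts a list of strings and returns a batch of
# embeddings in one call, e.g. model_sbert.encode([a1, a2, a5]); the one-string
# helper above is kept to mirror the original evaluation flow.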
a1 = "65 MOUNTAIN BLVD EXT, WARREN, NJ 07059"
a2 = "112 MOUNTAIN BLVD EXT, WARREN, NJ 07059"
a3 = "1677 NJ-27 #2, EDISON, NJ 08817"
a4 = "5078 S MARYLAND PKWY, LAS VEGAS, NV 89119"
a5 = "65 MOUNTAIN BOULEVARD EXT, WARREN, NJ 07059"
a6 = "123 BROAD ST, NEW YORK, NY, 10304-2345"
a7 = "440 TECHNOLOGY CENTER DRIVE, BOSTON, MA 10034"
a8 = "200 TECHNOLOGY CENTER DRIVE, BOSTON, MA 10034"
a8x= "87 TECHNOLOGY CENTER DRIVE, BOSTON, MA 10034"
#a9 = "440 TECHNOLOGY CENTER DR., BOSTON, MA 10034"
a10= "440 TECHNOLOGY CENTER DR., BOSTON, MA 10034"
a11="87-22 ROUTE 13, CORTLANDVILLE, NY 13045"
a12="87 22 ROUTE 13, CORTLANDVILLE, NY 13045"
a13="87-55 ROUTE 13, CORTLANDVILLE, NY 13045"
a14="257 37 US ROUTE 11, EVANS MILLS, NY 13637"
a15="257-37 US ROUTE 11, EVANS MILLS, NY 13637"
a16="15645 S MAIN ST SUITE D, PENNINGTON, NJ 08534"
a17="156-45 S MAIN ST SUITE D, PENNINGTON, NJ 08534"
a18="156-46 S MAIN ST SUITE D, PENNINGTON, NJ 08534"
a19 = "THE PAVILION AT QUEENS FOR REHABILITAION AND NURSING 36-17 PARSONS BOULEVARD, FLUSHING, NY 11354"
a20 = "136-17 39TH AVENUE, 4TH FLOOR, SUITE CF-E, FLUSHING, NY 11354"
a21="WISDOM MEDICAL P.C., 136-20 38 TH AVE 6E, FLUSHING, NY 11354"
encoded_input = tokenizer(a21, return_tensors='pt')
input_ids = encoded_input.input_ids
input_num_tokens = input_ids.shape[1]
print(input_num_tokens)
list_of_tokens = tokenizer.convert_ids_to_tokens(input_ids.view(-1).tolist())
#
print( "Tokens : " + ' '.join(list_of_tokens))
#def get_embedding(input_text):
# encoded_input = tokenizer(input_text, return_tensors='pt')
# input_ids = encoded_input.input_ids
# input_num_tokens = input_ids.shape[1]
#
# print( "Number of input tokens: " + str(input_num_tokens))
# print("Length of input: " + str(len(input_text)))
#
# list_of_tokens = tokenizer.convert_ids_to_tokens(input_ids.view(-1).tolist())
#
# print( "Tokens : " + ' '.join(list_of_tokens))
# with torch.no_grad():
#
# outputs = model(**encoded_input)
# last_hidden_states = outputs[0]
# sentence_embedding = torch.mean(last_hidden_states[0], dim=0)
# #sentence_embedding = output.last_hidden_state[0][0]
# return sentence_embedding.tolist()
e1 = get_sbert_embedding(a1)
e2 = get_sbert_embedding(a2)
#e3 = get_sbert_embedding(a3)
e4 = get_sbert_embedding(a4)
e5 = get_sbert_embedding(a5)
e6 = get_sbert_embedding(a6)
e7 = get_sbert_embedding(a7)
e8 = get_sbert_embedding(a8)
e8x = get_sbert_embedding(a8x)
#e9 = get_sbert_embedding(a9)
e10 = get_sbert_embedding(a10)
e11 = get_sbert_embedding(a11)
e12 = get_sbert_embedding(a12)
e13 = get_sbert_embedding(a13)
e14 = get_sbert_embedding(a14)
e15 = get_sbert_embedding(a15)
e16 = get_sbert_embedding(a16)
e17 = get_sbert_embedding(a17)
e18 = get_sbert_embedding(a18)
print(f"a1 \"{a1}\" to \"{a2}\" a2 - expected Different")
print(cosine_similarity([e1], [e2]))
print(f"a1 \"{a1}\" to \"{a4}\" a4 - expected Different")
print(cosine_similarity([e1], [e4]))
print(f"a1 \"{a1}\" to \"{a5}\" a5 - expected Same")
print(cosine_similarity([e1], [e5]))
print(f"a7 \"{a7}\" to \"{a8}\" a8 - expected Different")
print(cosine_similarity([e7], [e8]))
print(f"a7 \"{a7}\" to \"{a8x}\" a8x - expected Different")
print(cosine_similarity([e7], [e8x]))
#print(f"a7 \"{a7}\" to \"{a9}\" a9 - expected Same")
#print(cosine_similarity([e7], [e9]))
print(f"a7 \"{a7}\" to \"{a10}\" a10 - expected Same")
print(cosine_similarity([e7], [e10]))
print(f"a11 \"{a11}\" to \"{a12}\" a12 - expected Same")
print(cosine_similarity([e11], [e12]))
print(f"a11 \"{a11}\" to \"{a13}\" a13 - expected Different")
print(cosine_similarity([e11], [e13]))
print(f"a14 \"{a14}\" to \"{a15}\" a15 - expected Same")
print(cosine_similarity([e14], [e15]))
print(f"a16 \"{a16}\" to \"{a17}\" a17 - expected Same")
print(cosine_similarity([e16], [e17]))
print(f"a16 \"{a16}\" to \"{a18}\" a18 - expected Different")
print(cosine_similarity([e16], [e18]))
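# A more compact way to run the same kind of comparisons (a sketch; it repeats a
# few of the pairs above just to show the pattern).
pairs = [
    ("a1 vs a2", e1, e2, "Different"),
    ("a1 vs a5", e1, e5, "Same"),
    ("a16 vs a18", e16, e18, "Different"),
]
for label, ea, eb, expected in pairs:
    print(f"{label} - expected {expected}: {cosine_similarity([ea], [eb])[0][0]:.6f}")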
# with base
#a1 to a2
#[[0.99512167]]
#a1 to a4
#[[0.94850088]]
#a1 to a5
#[[0.99636901]]
# with large
#a1 to a2
#[[0.99682108]]
#a1 to a4
#[[0.94006972]]
#a1 to a5
#[[0.99503919]]
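# One possible way to turn a similarity score into a same/different decision.
# The 0.98 threshold below is an assumption for illustration only, not a value
# validated by this evaluation.
def is_same_address(text_a, text_b, threshold=0.98):
    emb_a = get_sbert_embedding(text_a)
    emb_b = get_sbert_embedding(text_b)
    return cosine_similarity([emb_a], [emb_b])[0][0] >= threshold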