rachith committed on
Commit
6bab54c
·
1 Parent(s): 6b9b713

preloading models brute force

Browse files
Files changed (1) hide show
  1. app.py +35 -32
app.py CHANGED
@@ -2,48 +2,51 @@ import gradio as gr
2
  from transformers import AutoModel, AutoTokenizer
3
  from sklearn.neighbors import NearestNeighbors
4
 
5
# Checkpoints to compare: Twitter RoBERTa snapshots from two time periods.
available_models = [
    'cardiffnlp/twitter-roberta-base-2019-90m',
    'cardiffnlp/twitter-roberta-base-jun2020',
]

# Pre-load one model/tokenizer pair per checkpoint at startup so queries
# never trigger a download or re-instantiation.
models = {name: AutoModel.from_pretrained(name) for name in available_models}
tokenizers = {name: AutoTokenizer.from_pretrained(name) for name in available_models}
14
 
15
-
16
# Per-checkpoint cache for the kNN result: kneighbors over the whole
# vocabulary is very expensive, so compute it at most once per MODEL.
_topk_cache = {}

def topk_model(MODEL):
    """Return ``(distances, indices, tokenizer)`` for the given checkpoint.

    ``distances``/``indices`` are the arrays produced by scikit-learn
    ``NearestNeighbors.kneighbors`` run over the checkpoint's static input
    word-embedding matrix (500 cosine neighbours per vocabulary entry);
    ``tokenizer`` is the checkpoint's tokenizer from the preloaded cache.

    The original implementation recomputed the full-vocabulary kNN on
    every call; results are now cached per checkpoint.
    """
    if MODEL not in _topk_cache:
        # Static (non-contextual) input embeddings of the vocabulary.
        embedding_matrix = models[MODEL].embeddings.word_embeddings.weight
        embedding_matrix = embedding_matrix.detach().numpy()

        knn_model = NearestNeighbors(n_neighbors=500,
                                     metric='cosine',
                                     algorithm='auto',
                                     n_jobs=3)
        nbrs = knn_model.fit(embedding_matrix)

        # Neighbours of every vocabulary row at once — the expensive step.
        distances, indices = nbrs.kneighbors(embedding_matrix)
        _topk_cache[MODEL] = (distances, indices)

    distances, indices = _topk_cache[MODEL]
    return distances, indices, tokenizers[MODEL]
33
 
34
 
35
  title = "How does a word's meaning change with time?"
36
 
37
def topk(word, model):
    """Print and return the nearest-neighbour tokens of *word* under *model*."""
    distances, indices, tokenizer = topk_model(model)

    # Position 1 skips the leading special token added by encode();
    # assumes the word maps to a single piece — TODO confirm for
    # words that split into multiple subword pieces.
    token_ids = tokenizer.encode(f'{word}')

    neighbours = []
    for neighbour_id in indices[token_ids[1]]:
        decoded = tokenizer.decode(neighbour_id)
        neighbours.append(decoded)
        print(decoded)

    return neighbours
 
 
 
 
 
 
 
 
 
 
 
 
47
 
48
  # with gr.Blocks() as demo:
49
  # gr.Markdown(f" # {title}")
 
2
  from transformers import AutoModel, AutoTokenizer
3
  from sklearn.neighbors import NearestNeighbors
4
 
5
# Dropdown choices: which time-period checkpoint to query.
available_models = ['2019', '2020']
7
 
8
# ---- 2019 checkpoint: loaded once at import time ("brute force" preload) ----
model_2019 = AutoModel.from_pretrained('cardiffnlp/twitter-roberta-base-2019-90m')
tokenizers_2019 = AutoTokenizer.from_pretrained('cardiffnlp/twitter-roberta-base-2019-90m')

# Static input-embedding matrix of the vocabulary, as a numpy array.
embedding_matrix_2019 = model_2019.embeddings.word_embeddings.weight
embedding_matrix_2019 = embedding_matrix_2019.detach().numpy()

# Precompute the 500 cosine nearest neighbours of every vocabulary entry.
knn_model_2019 = NearestNeighbors(n_neighbors=500, metric='cosine',
                                  algorithm='auto', n_jobs=3)
nbrs_2019 = knn_model_2019.fit(embedding_matrix_2019)
distances_2019, indices_2019 = nbrs_2019.kneighbors(embedding_matrix_2019)
18
 
 
 
 
19
 
20
# ---- 2020 checkpoint: loaded once at import time ("brute force" preload) ----
model_2020 = AutoModel.from_pretrained('cardiffnlp/twitter-roberta-base-jun2020')
tokenizers_2020 = AutoTokenizer.from_pretrained('cardiffnlp/twitter-roberta-base-jun2020')

# Static input-embedding matrix of the vocabulary, as a numpy array.
embedding_matrix_2020 = model_2020.embeddings.word_embeddings.weight
embedding_matrix_2020 = embedding_matrix_2020.detach().numpy()

# Precompute the 500 cosine nearest neighbours of every vocabulary entry.
knn_model_2020 = NearestNeighbors(n_neighbors=500, metric='cosine',
                                  algorithm='auto', n_jobs=3)
nbrs_2020 = knn_model_2020.fit(embedding_matrix_2020)
distances_2020, indices_2020 = nbrs_2020.kneighbors(embedding_matrix_2020)
 
 
 
 
 
 
 
 
30
 
31
 
32
  title = "How does a word's meaning change with time?"
33
 
34
def topk(word, model):
    """Return (and print) the nearest-neighbour tokens of *word*.

    Parameters
    ----------
    word : str
        Query word.
    model : str
        Checkpoint key, ``'2019'`` or ``'2020'`` (see ``available_models``).

    Returns
    -------
    list of str
        Decoded neighbour tokens; an empty list for an unrecognised
        ``model`` (the original fell through and implicitly returned None).
    """
    # Dispatch to the resources preloaded at import time; this replaces
    # two duplicated if-branches with identical bodies.
    if model == '2019':
        tokenizer, indices = tokenizers_2019, indices_2019
    elif model == '2020':
        tokenizer, indices = tokenizers_2020, indices_2020
    else:
        return []  # unknown checkpoint name — explicit empty result

    # Position 1 skips the leading special token added by encode();
    # assumes the word maps to a single piece — TODO confirm for
    # words that split into multiple subword pieces.
    token_ids = tokenizer.encode(f'{word}')

    outs = []
    for neighbour_id in indices[token_ids[1]]:
        decoded = tokenizer.decode(neighbour_id)
        outs.append(decoded)
        print(decoded)
    return outs
50
 
51
  # with gr.Blocks() as demo:
52
  # gr.Markdown(f" # {title}")