hanchier committed on
Commit
3ecbde7
1 Parent(s): acd6966

word embeddings

Browse files
Files changed (1) hide show
  1. app.py +10 -15
app.py CHANGED
@@ -36,7 +36,7 @@ def word_embedding_space_analysis(
36
  S, V, D = torch.linalg.svd(matrix)
37
 
38
  data = []
39
- top = 30
40
  select_words = 20
41
  n_dim = 10
42
  for _i in range(n_dim):
@@ -54,15 +54,16 @@ def word_embedding_space_analysis(
54
  word = word[1:]
55
  if word.lower() in nltk.corpus.words.words():
56
  output.append(word)
57
- return output[:select_words]
58
 
59
- data.append([
60
- ", ".join(filter_words(side_tokens))
61
- for side_tokens in [left_tokens, right_tokens]
62
- ])
 
63
  return pd.DataFrame(
64
  data,
65
- columns=["One Direction", "Another Direction"],
66
  index=[f"Dim#{_i}" for _i in range(n_dim)],
67
  )
68
 
@@ -196,7 +197,7 @@ def main():
196
  # Analysing the sentence
197
  st.divider()
198
  st.divider()
199
- st.subheader("LM-Steer Converts LMs into Text Analyzers")
200
  '''
201
  LM-Steer also serves as a probe for analyzing the text. It can be used to
202
  analyze the sentiment and detoxification of the text. Now, we proceed and
@@ -267,14 +268,8 @@ def main():
267
  embeddings: what word dimensions contribute to or contrast to a specific
268
  style. This analysis can be used to understand the word embedding space
269
  and how it steers the model's generation.
270
-
271
- Note that due to the bidirectional nature of the embedding spaces, in each
272
- dimension, sometimes only one side of the word embeddings contributes
273
- (has an impact on the style), while the other side, (resulting in negative
274
- logits) has a negligible impact on the style. The table below shows both
275
- sides of the word embeddings in each dimension.
276
  '''
277
- for dimension in ["Sentiment", "Detoxification"]:
278
  f'##### {dimension} Word Dimensions'
279
  dim = 2 if dimension == "Sentiment" else 0
280
  analysis_result = word_embedding_space_analysis(
 
36
  S, V, D = torch.linalg.svd(matrix)
37
 
38
  data = []
39
+ top = 50
40
  select_words = 20
41
  n_dim = 10
42
  for _i in range(n_dim):
 
54
  word = word[1:]
55
  if word.lower() in nltk.corpus.words.words():
56
  output.append(word)
57
+ return output
58
 
59
+ left_tokens = filter_words(left_tokens)
60
+ right_tokens = filter_words(right_tokens)
61
+ if len(left_tokens) < len(right_tokens):
62
+ left_tokens = right_tokens
63
+ data.append(", ".join(left_tokens[:select_words]))
64
  return pd.DataFrame(
65
  data,
66
+ columns=["Words Contributing to the Style"],
67
  index=[f"Dim#{_i}" for _i in range(n_dim)],
68
  )
69
 
 
197
  # Analysing the sentence
198
  st.divider()
199
  st.divider()
200
+ st.subheader("LM-Steer Converts Any LM Into A Text Analyzer")
201
  '''
202
  LM-Steer also serves as a probe for analyzing the text. It can be used to
203
  analyze the sentiment and detoxification of the text. Now, we proceed and
 
268
  embeddings: what word dimensions contribute to or contrast to a specific
269
  style. This analysis can be used to understand the word embedding space
270
  and how it steers the model's generation.
 
 
 
 
 
 
271
  '''
272
+ for dimension in ["Detoxification", "Sentiment"]:
273
  f'##### {dimension} Word Dimensions'
274
  dim = 2 if dimension == "Sentiment" else 0
275
  analysis_result = word_embedding_space_analysis(