jskim committed on
Commit
091bb76
·
1 Parent(s): 0532283

visualizing more direct information upfront. leaving interactive parts as the next step.

Browse files
Files changed (2) hide show
  1. app.py +186 -19
  2. score.py +42 -6
app.py CHANGED
@@ -43,7 +43,7 @@ def get_similar_paper(
43
  name, papers = get_text_from_author_id(author_id_input)
44
 
45
  # Compute Doc-level affinity scores for the Papers
46
- print('computing scores...')
47
  # TODO detect duplicate papers?
48
  titles, abstracts, doc_scores = compute_document_score(
49
  doc_model,
@@ -72,40 +72,77 @@ def get_similar_paper(
72
  start = time.time()
73
  input_sentences = sent_tokenize(abstract_text_input)
74
  num_sents = len(input_sentences)
 
 
75
  for aa, (tt, ab, ds) in enumerate(zip(titles, abstracts, doc_scores)):
76
  # Compute sent-level and phrase-level affinity scores for each paper
77
- sent_ids, sent_scores, info = get_highlight_info(
78
  sent_model,
79
  abstract_text_input,
80
  ab,
81
  K=2
82
  )
83
-
84
- word_scores = dict()
85
 
86
- # different highlights for each input sentence
 
87
  for i in range(num_sents):
88
  word_scores[str(i)] = {
89
  "original": ab,
90
  "interpretation": list(zip(info['all_words'], info[i]['scores']))
91
- } # format to feed to for Gradio Interpretation component
92
 
93
  tmp[display_title[aa]] = {
94
  'title': tt,
95
  'abstract': ab,
96
  'doc_score': ds,
97
  'source_sentences': input_sentences,
98
- 'highlight': word_scores
 
99
  }
100
- pickle.dump(tmp, open('info.pkl', 'wb')) # TODO better ways of saving intermediate results?
 
 
101
  end = time.time()
102
- print('done in [%0.2f] seconds'%(end - start))
 
 
 
 
 
 
103
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
  return (
105
- gr.update(choices=display_title, interactive=True, visible=True), # set of papers
106
- gr.update(choices=input_sentences, interactive=True, visible=True), # submission sentences
107
- gr.update(visible=True), # title row
108
- gr.update(visible=True), # abstract row
109
  )
110
 
111
  def update_name(author_id_input):
@@ -147,6 +184,7 @@ with gr.Blocks() as demo:
147
  # Text description about the app and disclaimer
148
  ### TEXT Description
149
  # TODO add instruction video link
 
150
  gr.Markdown(
151
  """
152
  # Paper Matching Helper
@@ -186,9 +224,93 @@ Below we describe how to use the tool. Also feel free to check out the [video]()
186
  author_id_input.change(fn=update_name, inputs=author_id_input, outputs=name)
187
  with gr.Row():
188
  compute_btn = gr.Button('What Makes This a Good Match?')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
189
 
190
  ### PAPER INFORMATION
191
-
192
  # show multiple papers in radio check box to select from
193
  with gr.Row():
194
  selected_papers_radio = gr.Radio(
@@ -205,9 +327,7 @@ Below we describe how to use the tool. Also feel free to check out the [video]()
205
  affinity= gr.Number(label='Affinity', interactive=False, value=0)
206
  with gr.Row():
207
  paper_abstract = gr.Textbox(label='Abstract', interactive=False, visible=False)
208
-
209
- ## TODO consider adding more direct information feeding to the users before giving them options for interactions.
210
-
211
  ### RELEVANT PARTS (HIGHLIGHTS)
212
  with gr.Row():
213
  with gr.Column(scale=2): # text from submission
@@ -221,7 +341,7 @@ Below we describe how to use the tool. Also feel free to check out the [video]()
221
 
222
  ### EVENT LISTENERS
223
 
224
- # retrieve similar papers
225
  compute_btn.click(
226
  fn=get_similar_paper,
227
  inputs=[
@@ -229,13 +349,60 @@ Below we describe how to use the tool. Also feel free to check out the [video]()
229
  pdf_file_input,
230
  author_id_input
231
  ],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
232
  outputs=[
233
  selected_papers_radio,
234
  source_sentences,
235
  title_row,
236
  paper_abstract
237
  ]
238
- )
239
 
240
  # change highlight based on selected sentences from submission
241
  source_sentences.change(
 
43
  name, papers = get_text_from_author_id(author_id_input)
44
 
45
  # Compute Doc-level affinity scores for the Papers
46
+ print('computing document scores...')
47
  # TODO detect duplicate papers?
48
  titles, abstracts, doc_scores = compute_document_score(
49
  doc_model,
 
72
  start = time.time()
73
  input_sentences = sent_tokenize(abstract_text_input)
74
  num_sents = len(input_sentences)
75
+
76
+ summary_info = dict() # elements to visualize upfront
77
  for aa, (tt, ab, ds) in enumerate(zip(titles, abstracts, doc_scores)):
78
  # Compute sent-level and phrase-level affinity scores for each paper
79
+ sent_ids, sent_scores, info, top_pairs_info = get_highlight_info(
80
  sent_model,
81
  abstract_text_input,
82
  ab,
83
  K=2
84
  )
 
 
85
 
86
+ # get scores for each word in the format for Gradio Interpretation component
87
+ word_scores = dict()
88
  for i in range(num_sents):
89
  word_scores[str(i)] = {
90
  "original": ab,
91
  "interpretation": list(zip(info['all_words'], info[i]['scores']))
92
+ }
93
 
94
  tmp[display_title[aa]] = {
95
  'title': tt,
96
  'abstract': ab,
97
  'doc_score': ds,
98
  'source_sentences': input_sentences,
99
+ 'highlight': word_scores,
100
+ 'top_pairs': top_pairs_info
101
  }
102
+
103
+ # TODO better ways of saving intermediate results? user identifiers per session?
104
+ pickle.dump(tmp, open('info.pkl', 'wb'))
105
  end = time.time()
106
+ print('done in [%0.2f] seconds'%(end - start))
107
+
108
+ # set up elements to show
109
+ out = [
110
+ gr.update(choices=display_title, interactive=True, visible=False), # set of papers (radio)
111
+ gr.update(choices=input_sentences, interactive=True, visible=False) # submission sentences
112
+ ]
113
 
114
+ # set up elements to visualize upfront
115
+ top_papers_show = 3 # number of top papers to show upfront
116
+ top_num_info_show = 2 # number of sentence pairs from each paper to show upfront
117
+ summary_out = []
118
+ for i in range(top_papers_show):
119
+ out_tmp = [
120
+ gr.update(value=titles[i], visible=True),
121
+ gr.update(value=doc_scores[i], visible=True)
122
+ ]
123
+ tp = tmp[display_title[i]]['top_pairs']
124
+ for j in range(top_num_info_show):
125
+ out_tmp += [
126
+ gr.update(value=tp[j]['score'], visible=True),
127
+ tp[j]['query']['original'],
128
+ tp[j]['query'],
129
+ tp[j]['candidate']['original'],
130
+ tp[j]['candidate']
131
+ ]
132
+ summary_out += out_tmp
133
+
134
+ # add updates to the show more button
135
+ out = out + summary_out + [gr.update(visible=True)] # show more button
136
+ assert(len(out) == (top_num_info_show * 5 + 2) * top_papers_show + 3)
137
+
138
+ return tuple(out)
139
+
140
+ def show_more():
141
  return (
142
+ gr.update(visible=True), # set of papers
143
+ gr.update(visible=True), # submission sentences
144
+ gr.update(visible=True), # title row
145
+ gr.update(visible=True), # abstract row
146
  )
147
 
148
  def update_name(author_id_input):
 
184
  # Text description about the app and disclaimer
185
  ### TEXT Description
186
  # TODO add instruction video link
187
+ # TODO update instructions based on new changes
188
  gr.Markdown(
189
  """
190
  # Paper Matching Helper
 
224
  author_id_input.change(fn=update_name, inputs=author_id_input, outputs=name)
225
  with gr.Row():
226
  compute_btn = gr.Button('What Makes This a Good Match?')
227
+
228
+
229
+ ### OVERVIEW
230
+ # Paper title, score, and top-ranking sentence pairs -- two sentence pairs per paper, three papers
231
+ # TODO blockfy similar components together and simplify
232
+ ## ONE BLOCK OF INFO FOR A SINGLE PAPER
233
+ ## PAPER1
234
+ with gr.Row():
235
+ with gr.Column(scale=3):
236
+ paper_title1 = gr.Textbox(label="From the reviewer's paper:", interactive=False, visible=False)
237
+ with gr.Column(scale=1):
238
+ affinity1 = gr.Number(label='Affinity', interactive=False, value=0, visible=False)
239
+ with gr.Row() as rel1_1:
240
+ with gr.Column(scale=1):
241
+ sent_pair_score1_1 = gr.Number(label='Sentence Relevance', interactive=False, value=0, visible=False)
242
+ with gr.Column(scale=4):
243
+ sent_pair_source1_1 = gr.Textbox(label='Sentence from Submission', visible=False)
244
+ sent_pair_source1_1_hl = gr.components.Interpretation(sent_pair_source1_1)
245
+ with gr.Column(scale=4):
246
+ sent_pair_candidate1_1 = gr.Textbox(label='Sentence from Paper', visible=False)
247
+ sent_pair_candidate1_1_hl = gr.components.Interpretation(sent_pair_candidate1_1)
248
+ with gr.Row() as rel1_2:
249
+ with gr.Column(scale=1):
250
+ sent_pair_score1_2 = gr.Number(label='Sentence Relevance', interactive=False, value=0, visible=False)
251
+ with gr.Column(scale=4):
252
+ sent_pair_source1_2 = gr.Textbox(label='Sentence from Submission', visible=False)
253
+ sent_pair_source1_2_hl = gr.components.Interpretation(sent_pair_source1_2)
254
+ with gr.Column(scale=4):
255
+ sent_pair_candidate1_2 = gr.Textbox(label='Sentence from Paper', visible=False)
256
+ sent_pair_candidate1_2_hl = gr.components.Interpretation(sent_pair_candidate1_2)
257
+
258
+ ## PAPER 2
259
+ with gr.Row():
260
+ with gr.Column(scale=3):
261
+ paper_title2 = gr.Textbox(label="From the reviewer's paper:", interactive=False, visible=False)
262
+ with gr.Column(scale=1):
263
+ affinity2 = gr.Number(label='Affinity', interactive=False, value=0, visible=False)
264
+ with gr.Row() as rel2_1:
265
+ with gr.Column(scale=1):
266
+ sent_pair_score2_1 = gr.Number(label='Sentence Relevance', interactive=False, value=0, visible=False)
267
+ with gr.Column(scale=4):
268
+ sent_pair_source2_1 = gr.Textbox(label='Sentence from Submission', visible=False)
269
+ sent_pair_source2_1_hl = gr.components.Interpretation(sent_pair_source2_1)
270
+ with gr.Column(scale=4):
271
+ sent_pair_candidate2_1 = gr.Textbox(label='Sentence from Submission', visible=False)
272
+ sent_pair_candidate2_1_hl = gr.components.Interpretation(sent_pair_candidate2_1)
273
+ with gr.Row() as rel2_2:
274
+ with gr.Column(scale=1):
275
+ sent_pair_score2_2 = gr.Number(label='Sentence Relevance', interactive=False, value=0, visible=False)
276
+ with gr.Column(scale=4):
277
+ sent_pair_source2_2 = gr.Textbox(label='Sentence from Submission', visible=False)
278
+ sent_pair_source2_2_hl = gr.components.Interpretation(sent_pair_source2_2)
279
+ with gr.Column(scale=4):
280
+ sent_pair_candidate2_2 = gr.Textbox(label='Sentence from Submission', visible=False)
281
+ sent_pair_candidate2_2_hl = gr.components.Interpretation(sent_pair_candidate2_2)
282
+
283
+ ## PAPER 3
284
+ with gr.Row():
285
+ with gr.Column(scale=3):
286
+ paper_title3 = gr.Textbox(label="From the reviewer's paper:", interactive=False, visible=False)
287
+ with gr.Column(scale=1):
288
+ affinity3 = gr.Number(label='Affinity', interactive=False, value=0, visible=False)
289
+ with gr.Row() as rel3_1:
290
+ with gr.Column(scale=1):
291
+ sent_pair_score3_1 = gr.Number(label='Sentence Relevance', interactive=False, value=0, visible=False)
292
+ with gr.Column(scale=4):
293
+ sent_pair_source3_1 = gr.Textbox(label='Sentence from Submission', visible=False)
294
+ sent_pair_source3_1_hl = gr.components.Interpretation(sent_pair_source3_1)
295
+ with gr.Column(scale=4):
296
+ sent_pair_candidate3_1 = gr.Textbox(label='Sentence from Submission', visible=False)
297
+ sent_pair_candidate3_1_hl = gr.components.Interpretation(sent_pair_candidate3_1)
298
+ with gr.Row() as rel3_2:
299
+ with gr.Column(scale=1):
300
+ sent_pair_score3_2 = gr.Number(label='Sentence Relevance', interactive=False, value=0, visible=False)
301
+ with gr.Column(scale=4):
302
+ sent_pair_source3_2 = gr.Textbox(label='Sentence from Submission', visible=False)
303
+ sent_pair_source3_2_hl = gr.components.Interpretation(sent_pair_source3_2)
304
+ with gr.Column(scale=4):
305
+ sent_pair_candidate3_2 = gr.Textbox(label='Sentence from Submission', visible=False)
306
+ sent_pair_candidate3_2_hl = gr.components.Interpretation(sent_pair_candidate3_2)
307
+
308
+ ## Show more button
309
+ with gr.Row():
310
+ see_more_rel_btn = gr.Button('See more relevant parts from papers', visible=False)
311
 
312
  ### PAPER INFORMATION
313
+
314
  # show multiple papers in radio check box to select from
315
  with gr.Row():
316
  selected_papers_radio = gr.Radio(
 
327
  affinity= gr.Number(label='Affinity', interactive=False, value=0)
328
  with gr.Row():
329
  paper_abstract = gr.Textbox(label='Abstract', interactive=False, visible=False)
330
+
 
 
331
  ### RELEVANT PARTS (HIGHLIGHTS)
332
  with gr.Row():
333
  with gr.Column(scale=2): # text from submission
 
341
 
342
  ### EVENT LISTENERS
343
 
344
+ # retrieve similar papers and show top results
345
  compute_btn.click(
346
  fn=get_similar_paper,
347
  inputs=[
 
349
  pdf_file_input,
350
  author_id_input
351
  ],
352
+ outputs=[
353
+ selected_papers_radio,
354
+ source_sentences,
355
+ paper_title1, # paper info
356
+ affinity1,
357
+ sent_pair_score1_1,
358
+ sent_pair_source1_1,
359
+ sent_pair_source1_1_hl,
360
+ sent_pair_candidate1_1,
361
+ sent_pair_candidate1_1_hl,
362
+ sent_pair_score1_2,
363
+ sent_pair_source1_2,
364
+ sent_pair_source1_2_hl,
365
+ sent_pair_candidate1_2,
366
+ sent_pair_candidate1_2_hl,
367
+ paper_title2,
368
+ affinity2,
369
+ sent_pair_score2_1,
370
+ sent_pair_source2_1,
371
+ sent_pair_source2_1_hl,
372
+ sent_pair_candidate2_1,
373
+ sent_pair_candidate2_1_hl,
374
+ sent_pair_score2_2,
375
+ sent_pair_source2_2,
376
+ sent_pair_source2_2_hl,
377
+ sent_pair_candidate2_2,
378
+ sent_pair_candidate2_2_hl,
379
+ paper_title3,
380
+ affinity3,
381
+ sent_pair_score3_1,
382
+ sent_pair_source3_1,
383
+ sent_pair_source3_1_hl,
384
+ sent_pair_candidate3_1,
385
+ sent_pair_candidate3_1_hl,
386
+ sent_pair_score3_2,
387
+ sent_pair_source3_2,
388
+ sent_pair_source3_2_hl,
389
+ sent_pair_candidate3_2,
390
+ sent_pair_candidate3_2_hl,
391
+ see_more_rel_btn
392
+ ]
393
+ )
394
+
395
+ # Get more info (move to more interactive portion)
396
+ see_more_rel_btn.click(
397
+ fn=show_more,
398
+ inputs=None,
399
  outputs=[
400
  selected_papers_radio,
401
  source_sentences,
402
  title_row,
403
  paper_abstract
404
  ]
405
+ )
406
 
407
  # change highlight based on selected sentences from submission
408
  source_sentences.change(
score.py CHANGED
@@ -6,7 +6,6 @@ import numpy as np
6
  import tqdm
7
 
8
  def compute_sentencewise_scores(model, query_sents, candidate_sents):
9
- # TODO make this more general for different types of models
10
  # list of sentences from query and candidate
11
  q_v, c_v = get_embedding(model, query_sents, candidate_sents)
12
 
@@ -74,8 +73,10 @@ def get_match_phrase(w1, w2, method='pos'):
74
  pos2 = pos_tag(w2)
75
  for i, (w, p) in enumerate(pos2):
76
  if w.lower() in w1 and p in include:
 
77
  mask2[i] = 1
78
- return mask2
 
79
 
80
  def mark_words(query_sents, words, all_words, sent_start_id, sent_ids, sent_scores):
81
  """
@@ -102,12 +103,12 @@ def mark_words(query_sents, words, all_words, sent_start_id, sent_ids, sent_scor
102
  sent_range = (sent_start_id[sid], sent_start_id[sid+1])
103
  is_selected_sent[sent_range[0]:sent_range[1]] = 1
104
  word_scores[sent_range[0]:sent_range[1]] = sscore
105
- is_selected_phrase[sent_range[0]:sent_range[1]] = \
106
  get_match_phrase(query_words, all_words[sent_range[0]:sent_range[1]])
107
  else:
108
  is_selected_sent[sent_start_id[sid]:] = 1
109
  word_scores[sent_start_id[sid]:] = sscore
110
- is_selected_phrase[sent_start_id[sid]:] = \
111
  get_match_phrase(query_words, all_words[sent_start_id[sid]:])
112
 
113
  # update selected phrase scores (-1 meaning a different color in gradio)
@@ -135,7 +136,42 @@ def get_highlight_info(model, text1, text2, K=None):
135
  words2, all_words2, sent_start_id2 = get_words(sent2)
136
  info = mark_words(sent1, words2, all_words2, sent_start_id2, sent_ids, sent_scores)
137
 
138
- return sent_ids, sent_scores, info
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
139
 
140
  ### Document-level operations
141
 
@@ -194,4 +230,4 @@ def compute_document_score(doc_model, tokenizer, query, papers, batch=5):
194
  abstracts_sorted = [abstracts[x] for x in idx_sorted]
195
  scores_sorted = [scores[x] for x in idx_sorted]
196
 
197
- return titles_sorted, abstracts_sorted, scores_sorted
 
6
  import tqdm
7
 
8
  def compute_sentencewise_scores(model, query_sents, candidate_sents):
 
9
  # list of sentences from query and candidate
10
  q_v, c_v = get_embedding(model, query_sents, candidate_sents)
11
 
 
73
  pos2 = pos_tag(w2)
74
  for i, (w, p) in enumerate(pos2):
75
  if w.lower() in w1 and p in include:
76
+ j = w1.index(w.lower())
77
  mask2[i] = 1
78
+ mask1[j] = 1
79
+ return mask1, mask2
80
 
81
  def mark_words(query_sents, words, all_words, sent_start_id, sent_ids, sent_scores):
82
  """
 
103
  sent_range = (sent_start_id[sid], sent_start_id[sid+1])
104
  is_selected_sent[sent_range[0]:sent_range[1]] = 1
105
  word_scores[sent_range[0]:sent_range[1]] = sscore
106
+ _, is_selected_phrase[sent_range[0]:sent_range[1]] = \
107
  get_match_phrase(query_words, all_words[sent_range[0]:sent_range[1]])
108
  else:
109
  is_selected_sent[sent_start_id[sid]:] = 1
110
  word_scores[sent_start_id[sid]:] = sscore
111
+ _, is_selected_phrase[sent_start_id[sid]:] = \
112
  get_match_phrase(query_words, all_words[sent_start_id[sid]:])
113
 
114
  # update selected phrase scores (-1 meaning a different color in gradio)
 
136
  words2, all_words2, sent_start_id2 = get_words(sent2)
137
  info = mark_words(sent1, words2, all_words2, sent_start_id2, sent_ids, sent_scores)
138
 
139
+ # get top sentence pairs from the query and candidate (score, index_pair)
140
+ top_pair_num = 5
141
+ top_pairs = []
142
+ ii = np.unravel_index(np.argsort(np.array(sent_scores).ravel())[-top_pair_num:], sent_scores.shape)
143
+ for i, j in zip(ii[0][::-1], ii[1][::-1]):
144
+ score = sent_scores[i,j]
145
+ index_pair = (i, sent_ids[i,j].item())
146
+ top_pairs.append((score, index_pair)) # list of (score, (sent_id_query, sent_id_candidate))
147
+
148
+ # convert top_pairs to corresponding highlights format for Gradio Interpretation component
149
+ top_pairs_info = dict()
150
+ count = 0
151
+ for s, (sidq, sidc) in top_pairs:
152
+ q_sent = sent1[sidq]
153
+ c_sent = sent2[sidc]
154
+ q_words = word_tokenize(q_sent)
155
+ c_words = word_tokenize(c_sent)
156
+ mask1, mask2 = get_match_phrase(q_words, c_words)
157
+ mask1 *= -1 # mark matching phrases as blue
158
+ mask2 *= -1
159
+ assert(len(mask1) == len(q_words) and len(mask2) == len(c_words))
160
+ top_pairs_info[count] = {
161
+ 'query': {
162
+ 'original': q_sent,
163
+ 'interpretation': list(zip(q_words, mask1))
164
+ },
165
+ 'candidate': {
166
+ 'original': c_sent,
167
+ 'interpretation': list(zip(c_words, mask2))
168
+ },
169
+ 'score': s,
170
+ 'sent_idx': (sidq, sidc)
171
+ }
172
+ count += 1
173
+
174
+ return sent_ids, sent_scores, info, top_pairs_info
175
 
176
  ### Document-level operations
177
 
 
230
  abstracts_sorted = [abstracts[x] for x in idx_sorted]
231
  scores_sorted = [scores[x] for x in idx_sorted]
232
 
233
+ return titles_sorted, abstracts_sorted, scores_sorted