davanstrien HF staff commited on
Commit
e0eeac1
·
1 Parent(s): e835bc7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +8 -173
app.py CHANGED
@@ -129,8 +129,11 @@ article = """
129
  # British Library Books genre detection demo
130
 
131
  This demo allows you to play with a 'genre' detection model which has been trained to predict, from the title of a book, whether it is 'fiction' or 'non-fiction'.
 
132
  The demo also shows you which parts of the input the model is using most to make its prediction. You can hover over the words to see the attention score assigned to that word. This gives you some sense of which words are important to the model in making a prediction.
133
 
 
 
134
  ## Background
135
 
136
  This model was developed as part of work by the [Living with Machines](https://livingwithmachines.ac.uk/). The process of training the model and working with the data is documented in a tutorial which will be released soon.
@@ -139,12 +142,12 @@ This model was developed as part of work by the [Living with Machines](https://l
139
 
140
  This model is intended to predict, from the title of a book, whether it is 'fiction' or 'non-fiction'. This model was trained on data created from the [Digitised printed books (18th-19th Century)](https://www.bl.uk/collection-guides/digitised-printed-books) book collection.
141
  This dataset is dominated by English language books though it includes books in several other languages in much smaller numbers. This model was originally developed for use as part of the Living with Machines project to be able to 'segment' this large dataset of books into different categories based on a 'crude' classification of genre i.e. whether the title was `fiction` or `non-fiction`.
142
-
143
 
144
  ## Training data
145
 
146
  The model is trained on a particular collection of books digitised by the British Library. As a result the model may do less well on titles that look different to this data.
147
- In particular the training data, was mostly English, and mostly from the 19th Century. You can find more information about the model [here]((https://doi.org/10.5281/zenodo.5245175))
148
 
149
  ## Model performance
150
 
@@ -162,7 +165,9 @@ The models performance on a held-out test set is as follows:
162
  weighted avg 0.93 0.93 0.93 850
163
  ```
164
 
165
- > Credit: This work was partly supported by [Living with Machines](https://livingwithmachines.ac.uk/). This project, funded by the UK Research and Innovation (UKRI) Strategic Priority Fund, is a multidisciplinary collaboration delivered by the Arts and Humanities Research Council (AHRC), with The Alan Turing Institute, the British Library and the Universities of Cambridge, East Anglia, Exeter, and Queen Mary University of London.
 
 
166
 
167
  """
168
 
@@ -180,178 +185,8 @@ gr_interface = gr.Interface(
180
  allow_screenshot=True,
181
  )
182
  gr_interface.launch(inline=False, share=False)
183
- from functools import lru_cache
184
- from fastai.text.all import *
185
- from fastcore.all import *
186
- import matplotlib.cm as cm
187
- import html
188
- import gradio as gr
189
-
190
- learn_inf = load_learner("20210928-model.pkl")
191
-
192
-
193
- def _value2rgba(x, cmap=cm.RdYlGn, alpha_mult=1.0):
194
- "Convert a value `x` from 0 to 1 (inclusive) to an RGBA tuple according to `cmap` times transparency `alpha_mult`."
195
- c = cmap(x)
196
- rgb = (np.array(c[:-1]) * 255).astype(int)
197
- a = c[-1] * alpha_mult
198
- return tuple(rgb.tolist() + [a])
199
-
200
-
201
- def _eval_dropouts(mod):
202
- module_name = mod.__class__.__name__
203
- if "Dropout" in module_name or "BatchNorm" in module_name:
204
- mod.training = False
205
- for module in mod.children():
206
- _eval_dropouts(module)
207
-
208
-
209
- def _piece_attn_html(pieces, attns, sep=" ", **kwargs):
210
- html_code, spans = ['<span style="font-family: monospace;">'], []
211
- for p, a in zip(pieces, attns):
212
- p = html.escape(p)
213
- c = str(_value2rgba(a, alpha_mult=0.5, **kwargs))
214
- spans.append(
215
- f'<span title="{a:.3f}" style="background-color: rgba{c};">{p}</span>'
216
- )
217
- html_code.append(sep.join(spans))
218
- html_code.append("</span>")
219
- return "".join(html_code)
220
-
221
-
222
- def _show_piece_attn(*args, **kwargs):
223
- from IPython.display import display, HTML
224
-
225
- display(HTML(_piece_attn_html(*args, **kwargs)))
226
 
227
 
228
- @lru_cache(maxsize=1024 * 2)
229
- def _intrinsic_attention(learn, text, class_id=None):
230
- "Calculate the intrinsic attention of the input w.r.t to an output `class_id`, or the classification given by the model if `None`."
231
- learn.model.train()
232
- _eval_dropouts(learn.model)
233
- learn.model.zero_grad()
234
- learn.model.reset()
235
- dl = learn.dls.test_dl([text])
236
- batch = next(iter(dl))[0]
237
- emb = learn.model[0].module.encoder(batch).detach().requires_grad_(True)
238
- emb.retain_grad()
239
- lstm = learn.model[0].module(emb, True)
240
- learn.model.eval()
241
- cl = learn.model[1]((lstm, torch.zeros_like(batch).bool(),))[
242
- 0
243
- ].softmax(dim=-1)
244
- if class_id is None:
245
- class_id = cl.argmax()
246
- cl[0][class_id].backward()
247
- attn = emb.grad.squeeze().abs().sum(dim=-1)
248
- attn /= attn.max()
249
- tok, _ = learn.dls.decode_batch((*tuplify(batch), *tuplify(cl)))[0]
250
- return tok, attn
251
-
252
-
253
- @patch
254
- def intrinsic_attention(x: TextLearner, text: str, class_id: int = None, **kwargs):
255
- "Shows the `intrinsic attention for `text`, optional `class_id`"
256
- if isinstance(x, LMLearner):
257
- raise Exception("Language models are not supported")
258
- text, attn = _intrinsic_attention(x, text, class_id)
259
- return _piece_attn_html(text.split(), to_np(attn), **kwargs)
260
-
261
-
262
- labels = learn_inf.dls.vocab[1]
263
-
264
-
265
- @lru_cache(maxsize=1024 * 2)
266
- def predict_label(title):
267
- *_, probs = learn_inf.predict(title)
268
- return probs
269
-
270
-
271
- def predict(title):
272
- # *_, probs = learn_inf.predict(title)
273
-
274
- probs = predict_label(title)
275
- return learn_inf.intrinsic_attention(title), {
276
- labels[i]: float(probs[i]) for i in range(len(labels))
277
- }
278
-
279
-
280
- sample_text = [
281
- [
282
- "Poems on various subjects. Whereto is prefixed a short essay on the structure of English verse"
283
- ],
284
- [
285
- "Journal of a Residence in China and the neighbouring countries from 1830 to 1833. With an introductory essay by the Hon. and Rev. Baptist Wriothesley Noel. [With a map.]"
286
- ],
287
- ["The Adventures of Oliver Twist. [With plates.]"],
288
- ["['The Adventures of Sherlock Holmes', 'Single Works']"],
289
- [
290
- "['Coal, Iron, and Oil; or, the Practical American miner. A plain and popular work on our mines and mineral resources ... With numerous maps and engravings, etc']"
291
- ],
292
- [
293
- "Summer Travelling in Iceland; being the narrative of two journeys across the island ... With a chapter on Askja by E. Delmar Morgan ... Containing also a literal translation of three sagas. Maps, etc'"
294
- ],
295
- [
296
- "Histoire de France au moyen aÃÇge, depuis Philippe-Auguste jusqu'aÃÄ la fin du reÃÄgne de Louis XI. 1223-1483. Troisieme eÃÅdition"
297
- ],
298
- [
299
- "Two Centuries of Soho: its institutions, firms, and amusements. By the Clergy of St. Anne's, Soho, J. H. Cardwell ... H. B. Freeman ... G. C. Wilton ... assisted by other contributors, etc"
300
- ],
301
- ["""A Christmas Carol"""],
302
- ]
303
-
304
- description = """
305
- British Library Books genre detection model
306
- """
307
-
308
- article = """
309
- [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.5245175.svg)](https://doi.org/10.5281/zenodo.5245175)
310
-
311
- # British Library Books genre detection demo
312
-
313
- This demo alows you to play with a 'genre' detection model which has been trained to predict, from the title of a book, whether it is 'fiction' or 'non-fiction'.
314
- The model was trained with the [fastai](https://docs.fast.ai/) library on training data drawn from ditised books at the British Library. These Books are mainly from the 19th Century.
315
- The demo also shows you which parts of the input the model is using most to make its prediction. You can hover over the words to see the attenton score assigned to that word. This gives you some sense of which words are important to the model in making a prediction.
316
-
317
- The examples include titles from the BL books collection.
318
-
319
- ## Background
320
-
321
- This model was developed as part of work by the [Living with Machines](https://livingwithmachines.ac.uk/). The process of training the model and working with the data is documented in a tutorial which will be released soon.
322
-
323
- ## Model description
324
-
325
- This model is intended to predict, from the title of a book, whether it is 'fiction' or 'non-fiction'. This model was trained on data created from the [Digitised printed books (18th-19th Century)](https://www.bl.uk/collection-guides/digitised-printed-books) book collection.
326
- This dataset is dominated by English language books though it includes books in several other languages in much smaller numbers. This model was originally developed for use as part of the Living with Machines project to be able to 'segment' this large dataset of books into different categories based on a 'crude' classification of genre i.e. whether the title was `fiction` or `non-fiction`.
327
- You can find more information about the model [here]((https://doi.org/10.5281/zenodo.5245175))
328
-
329
- ## Training data
330
-
331
- The model is trained on a particular collection of books digitised by the British Library. As a result the model may do less well on titles that look different to this data.
332
- In particular the training data, was mostly English, and mostly from the 19th Century.
333
-
334
- ## Model performance
335
-
336
- The models performance on a held-out test set is as follows:
337
-
338
-
339
- ```
340
- precision recall f1-score support
341
-
342
- Fiction 0.91 0.88 0.90 296
343
- Non-fiction 0.94 0.95 0.95 554
344
-
345
- accuracy 0.93 850
346
- macro avg 0.93 0.92 0.92 850
347
- weighted avg 0.93 0.93 0.93 850
348
- ```
349
-
350
- ### Credits
351
- > This work was partly supported by [Living with Machines](https://livingwithmachines.ac.uk/). This project, funded by the UK Research and Innovation (UKRI) Strategic Priority Fund, is a multidisciplinary collaboration delivered by the Arts and Humanities Research Council (AHRC), with The Alan Turing Institute, the British Library and the Universities of Cambridge, East Anglia, Exeter, and Queen Mary University of London.
352
- > Code for showing attention was adapted from Zach Mueller's (@TheZachMueller) [fastinference](https://muellerzr.github.io/fastinference/) library.
353
-
354
- """
355
 
356
  gr_interface = gr.Interface(
357
  fn=predict,
 
129
  # British Library Books genre detection demo
130
 
131
  This demo allows you to play with a 'genre' detection model which has been trained to predict, from the title of a book, whether it is 'fiction' or 'non-fiction'.
132
+ The model was trained with the [fastai](https://docs.fast.ai/) library on training data drawn from digitised books at the British Library. These books are mainly from the 19th Century.
133
  The demo also shows you which parts of the input the model is using most to make its prediction. You can hover over the words to see the attention score assigned to that word. This gives you some sense of which words are important to the model in making a prediction.
134
 
135
+ The examples include titles from the BL books collection.
136
+
137
  ## Background
138
 
139
  This model was developed as part of work by the [Living with Machines](https://livingwithmachines.ac.uk/). The process of training the model and working with the data is documented in a tutorial which will be released soon.
 
142
 
143
  This model is intended to predict, from the title of a book, whether it is 'fiction' or 'non-fiction'. This model was trained on data created from the [Digitised printed books (18th-19th Century)](https://www.bl.uk/collection-guides/digitised-printed-books) book collection.
144
  This dataset is dominated by English language books though it includes books in several other languages in much smaller numbers. This model was originally developed for use as part of the Living with Machines project to be able to 'segment' this large dataset of books into different categories based on a 'crude' classification of genre i.e. whether the title was `fiction` or `non-fiction`.
145
+ You can find more information about the model [here](https://doi.org/10.5281/zenodo.5245175)
146
 
147
  ## Training data
148
 
149
  The model is trained on a particular collection of books digitised by the British Library. As a result the model may do less well on titles that look different to this data.
150
+ In particular the training data was mostly English, and mostly from the 19th Century.
151
 
152
  ## Model performance
153
 
 
165
  weighted avg 0.93 0.93 0.93 850
166
  ```
167
 
168
+ ### Credits
169
+ > This work was partly supported by [Living with Machines](https://livingwithmachines.ac.uk/). This project, funded by the UK Research and Innovation (UKRI) Strategic Priority Fund, is a multidisciplinary collaboration delivered by the Arts and Humanities Research Council (AHRC), with The Alan Turing Institute, the British Library and the Universities of Cambridge, East Anglia, Exeter, and Queen Mary University of London.
170
+ > Code for showing attention was adapted from Zach Mueller's (@TheZachMueller) [fastinference](https://muellerzr.github.io/fastinference/) library.
171
 
172
  """
173
 
 
185
  allow_screenshot=True,
186
  )
187
  gr_interface.launch(inline=False, share=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
188
 
189
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
190
 
191
  gr_interface = gr.Interface(
192
  fn=predict,