from functools import lru_cache
from fastai.text.all import *
from fastcore.all import *
import matplotlib.cm as cm
import html
import gradio as gr

learn_inf = load_learner("20210928-model.pkl")


def _value2rgba(x, cmap=cm.RdYlGn, alpha_mult=1.0):
    "Convert a value `x` from 0 to 1 (inclusive) to an RGBA tuple according to `cmap` times transparency `alpha_mult`."
    c = cmap(x)
    rgb = (np.array(c[:-1]) * 255).astype(int)
    a = c[-1] * alpha_mult
    return tuple(rgb.tolist() + [a])


def _eval_dropouts(mod):
    "Recursively put `Dropout` and `BatchNorm` layers in eval mode so they behave deterministically while the rest of the model stays in train mode."
    module_name = mod.__class__.__name__
    if "Dropout" in module_name or "BatchNorm" in module_name:
        mod.training = False
    for module in mod.children():
        _eval_dropouts(module)


def _piece_attn_html(pieces, attns, sep=" ", **kwargs):
    "Render `pieces` as HTML spans whose background colour encodes the matching score in `attns`; hovering shows the raw value."
    html_code, spans = ['<span style="font-family: monospace;">'], []
    for p, a in zip(pieces, attns):
        p = html.escape(p)
        c = str(_value2rgba(a, alpha_mult=0.5, **kwargs))
        spans.append(
            f'<span title="{a:.3f}" style="background-color: rgba{c};">{p}</span>'
        )
    html_code.append(sep.join(spans))
    html_code.append("</span>")
    return "".join(html_code)


def _show_piece_attn(*args, **kwargs):
    from IPython.display import display, HTML

    display(HTML(_piece_attn_html(*args, **kwargs)))


@lru_cache(maxsize=1024 * 2)
def _intrinsic_attention(learn, text, class_id=None):
    "Calculate the intrinsic attention of the input w.r.t. an output `class_id`, or the classification given by the model if `None`."
    learn.model.train()
    _eval_dropouts(learn.model)
    learn.model.zero_grad()
    learn.model.reset()
    dl = learn.dls.test_dl([text])
    batch = next(iter(dl))[0]
    # Embed the tokens, detaching from the embedding table so we can take
    # gradients with respect to the embeddings themselves.
    emb = learn.model[0].module.encoder(batch).detach().requires_grad_(True)
    emb.retain_grad()
    lstm = learn.model[0].module(emb, True)
    learn.model.eval()
    cl = learn.model[1]((lstm, torch.zeros_like(batch).bool()))[0].softmax(dim=-1)
    if class_id is None:
        class_id = cl.argmax()
    # Backpropagate the chosen class probability; per-token saliency is the
    # summed absolute gradient on each embedding, normalised to [0, 1].
    cl[0][class_id].backward()
    attn = emb.grad.squeeze().abs().sum(dim=-1)
    attn /= attn.max()
    tok, _ = learn.dls.decode_batch((*tuplify(batch), *tuplify(cl)))[0]
    return tok, attn


# fastcore's @patch attaches this function as a method on TextLearner.
@patch
def intrinsic_attention(x: TextLearner, text: str, class_id: int = None, **kwargs):
    "Show the intrinsic attention for `text`, optionally w.r.t. `class_id`."
    if isinstance(x, LMLearner):
        raise Exception("Language models are not supported")
    text, attn = _intrinsic_attention(x, text, class_id)
    return _piece_attn_html(text.split(), to_np(attn), **kwargs)


labels = learn_inf.dls.vocab[1]


@lru_cache(maxsize=1024 * 2)
def predict_label(title):
    "Return class probabilities for `title`, caching results for repeated queries."
    *_, probs = learn_inf.predict(title)
    return probs


def predict(title):
    probs = predict_label(title)
    return learn_inf.intrinsic_attention(title), {
        labels[i]: float(probs[i]) for i in range(len(labels))
    }


sample_text = [
    [
        "Poems on various subjects. Whereto is prefixed a short essay on the structure of English verse"
    ],
    [
        "Journal of a Residence in China and the neighbouring countries from 1830 to 1833. With an introductory essay by the Hon. and Rev. Baptist Wriothesley Noel. [With a map.]"
    ],
    ["The Adventures of Oliver Twist. [With plates.]"],
    ["['The Adventures of Sherlock Holmes', 'Single Works']"],
    [
        "['Coal, Iron, and Oil; or, the Practical American miner. A plain and popular work on our mines and mineral resources ... With numerous maps and engravings, etc']"
    ],
    [
        "Summer Travelling in Iceland; being the narrative of two journeys across the island ... With a chapter on Askja by E. Delmar Morgan ... Containing also a literal translation of three sagas. Maps, etc'"
    ],
    [
        "Histoire de France au moyen aÃÇge, depuis Philippe-Auguste jusqu'aÃÄ la fin du reÃÄgne de Louis XI. 1223-1483. Troisieme eÃÅdition"
    ],
    [
        "Two Centuries of Soho: its institutions, firms, and amusements. By the Clergy of St. Anne's, Soho, J. H. Cardwell ... H. B. Freeman ... G. C. Wilton ... assisted by other contributors, etc"
    ],
    ["""A Christmas Carol"""],
]

description = """
British Library Books genre detection model
"""

article = """
[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.5245175.svg)](https://doi.org/10.5281/zenodo.5245175)

# British Library Books genre detection demo

This demo allows you to play with a 'genre' detection model which has been trained to predict, from the title of a book, whether it is 'fiction' or 'non-fiction'.
The model was trained with the [fastai](https://docs.fast.ai/) library on data drawn from digitised books at the British Library. These books are mainly from the 19th century. 
The demo also shows you which parts of the input the model relies on most when making its prediction. You can hover over the words to see the attention score assigned to each word; under the hood these scores are gradient-based saliency values, i.e. how strongly the predicted class probability responds to each token. This gives you some sense of which words matter to the model when it makes a prediction.

The examples include titles from the BL books collection. 

## Background 

This model was developed as part of the [Living with Machines](https://livingwithmachines.ac.uk/) project. The process of training the model and working with the data is documented in a tutorial which will be released soon. 

## Model description

This model is intended to predict, from the title of a book, whether it is 'fiction' or 'non-fiction'. It was trained on data created from the [Digitised printed books (18th-19th Century)](https://www.bl.uk/collection-guides/digitised-printed-books) collection. 
This dataset is dominated by English-language books, though it includes books in several other languages in much smaller numbers. The model was originally developed within the Living with Machines project to 'segment' this large dataset into categories based on a 'crude' classification of genre, i.e. whether the title was `fiction` or `non-fiction`.
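
If you want to try the model outside this demo, a minimal sketch (assuming you have fastai installed and have downloaded the exported `20210928-model.pkl` file used by this app):

```python
from fastai.text.all import load_learner

learn = load_learner("20210928-model.pkl")
pred_class, pred_idx, probs = learn.predict("The Adventures of Oliver Twist")
print(pred_class, probs[pred_idx].item())
```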


## Training data

The model was trained on a particular collection of books digitised by the British Library. As a result, it may do less well on titles that look different from this data.
In particular, the training data was mostly English and mostly from the 19th century. You can find more information about the model [here](https://doi.org/10.5281/zenodo.5245175).

## Model performance

The model's performance on a held-out test set is as follows:


```
             precision    recall  f1-score   support

     Fiction       0.91      0.88      0.90       296
 Non-fiction       0.94      0.95      0.95       554

    accuracy                           0.93       850
   macro avg       0.93      0.92      0.92       850
weighted avg       0.93      0.93      0.93       850
```
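
This table follows the layout of scikit-learn's `classification_report`. As an illustrative sketch of how such a table is produced (the labels below are made up, not the real evaluation data):

```python
from sklearn.metrics import classification_report

# Hypothetical gold and predicted genre labels, for illustration only
y_true = ["Fiction", "Non-fiction", "Non-fiction", "Fiction"]
y_pred = ["Fiction", "Non-fiction", "Fiction", "Fiction"]
print(classification_report(y_true, y_pred))
```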

> Credits: This work was partly supported by [Living with Machines](https://livingwithmachines.ac.uk/). This project, funded by the UK Research and Innovation (UKRI) Strategic Priority Fund, is a multidisciplinary collaboration delivered by the Arts and Humanities Research Council (AHRC), with The Alan Turing Institute, the British Library and the Universities of Cambridge, East Anglia, Exeter, and Queen Mary University of London. 

> Code for showing attention was adapted from Zach Mueller's (@TheZachMueller) [fastinference](https://muellerzr.github.io/fastinference/) library. 

"""

gr_interface = gr.Interface(
    fn=predict,
    inputs=gr.inputs.Textbox(),
    outputs=[
        gr.outputs.HTML("Intrinsic attention"),
        gr.outputs.Label(num_top_classes=len(labels), label="Confidence"),
    ],
    title="British Library 19th Century Books Genre Classifier",
    description=description,
    article=article,
    examples=sample_text,
    allow_screenshot=True,
)
gr_interface.launch(inline=False, share=False)
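
# To run this demo locally, a sketch (assumptions: the exported model pickle
# sits next to this script, and the installed Gradio version still provides
# the gr.inputs / gr.outputs API used above, i.e. a 2.x-era release):
#
#     pip install "fastai>=2.4" "gradio<3"
#     python app.py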