from functools import lru_cache
from fastai.text.all import *
from fastcore.all import *
import matplotlib.cm as cm
import html
import gradio as gr
learn_inf = load_learner("20210928-model.pkl")

def _value2rgba(x, cmap=cm.RdYlGn, alpha_mult=1.0):
    "Convert a value `x` from 0 to 1 (inclusive) to an RGBA tuple according to `cmap`, scaling the alpha channel by `alpha_mult`."
    c = cmap(x)
    rgb = (np.array(c[:-1]) * 255).astype(int)
    a = c[-1] * alpha_mult
    return tuple(rgb.tolist() + [a])
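
# Illustrative only (the exact RGB numbers come from matplotlib's RdYlGn
# colormap data): _value2rgba(0.0) returns a red-ish tuple such as
# (165, 0, 38, 1.0), while _value2rgba(1.0, alpha_mult=0.5) returns a
# green-ish tuple whose alpha channel is 0.5.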

def _eval_dropouts(mod):
    "Recursively put `Dropout` and `BatchNorm` layers in eval mode while leaving the rest of the model in train mode."
    module_name = mod.__class__.__name__
    if "Dropout" in module_name or "BatchNorm" in module_name:
        mod.training = False
    for module in mod.children():
        _eval_dropouts(module)

def _piece_attn_html(pieces, attns, sep=" ", **kwargs):
    html_code, spans = ['<span style="font-family: monospace;">'], []
    for p, a in zip(pieces, attns):
        p = html.escape(p)
        c = str(_value2rgba(a, alpha_mult=0.5, **kwargs))
        spans.append(
            f'<span title="{a:.3f}" style="background-color: rgba{c};">{p}</span>'
        )
    html_code.append(sep.join(spans))
    html_code.append("</span>")
    return "".join(html_code)

def _show_piece_attn(*args, **kwargs):
    from IPython.display import display, HTML

    display(HTML(_piece_attn_html(*args, **kwargs)))

@lru_cache(maxsize=1024 * 2)
def _intrinsic_attention(learn, text, class_id=None):
    "Calculate the intrinsic attention of the input w.r.t. an output `class_id`, or the classification given by the model if `None`."
    # Train mode keeps gradients flowing through the RNN, but dropout and
    # batchnorm layers are switched to eval behaviour for a deterministic pass.
    learn.model.train()
    _eval_dropouts(learn.model)
    learn.model.zero_grad()
    learn.model.reset()
    dl = learn.dls.test_dl([text])
    batch = next(iter(dl))[0]
    emb = learn.model[0].module.encoder(batch).detach().requires_grad_(True)
    emb.retain_grad()
    lstm = learn.model[0].module(emb, True)
    learn.model.eval()
    cl = learn.model[1]((lstm, torch.zeros_like(batch).bool()))[0].softmax(dim=-1)
    if class_id is None:
        class_id = cl.argmax()
    cl[0][class_id].backward()
    # Per-token score: the summed absolute gradient of the class probability
    # w.r.t. that token's embedding, normalised to [0, 1].
    attn = emb.grad.squeeze().abs().sum(dim=-1)
    attn /= attn.max()
    tok, _ = learn.dls.decode_batch((*tuplify(batch), *tuplify(cl)))[0]
    return tok, attn
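
# Usage sketch (hypothetical input): `tok` is the decoded, tokenised text and
# `attn` a per-token relevance score in [0, 1], e.g.
# tok, attn = _intrinsic_attention(learn_inf, "A Christmas Carol")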

@patch
def intrinsic_attention(x: TextLearner, text: str, class_id: int = None, **kwargs):
    "Show the intrinsic attention for `text`, with an optional `class_id`."
    if isinstance(x, LMLearner):
        raise Exception("Language models are not supported")
    text, attn = _intrinsic_attention(x, text, class_id)
    return _piece_attn_html(text.split(), to_np(attn), **kwargs)
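
# fastcore's @patch attaches intrinsic_attention as a method on TextLearner,
# so the loaded learner can call it directly and get back an HTML string:
# learn_inf.intrinsic_attention("A Christmas Carol")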
labels = learn_inf.dls.vocab[1]

@lru_cache(maxsize=1024 * 2)
def predict_label(title):
    "Return the class probabilities for `title`, cached so repeated queries are cheap."
    *_, probs = learn_inf.predict(title)
    return probs


def predict(title):
    probs = predict_label(title)
    return learn_inf.intrinsic_attention(title), {
        labels[i]: float(probs[i]) for i in range(len(labels))
    }
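
# predict returns a pair matching the two Gradio outputs declared below: the
# attention HTML and a {label: probability} dict, roughly of the form
# ('<span ...>A Christmas Carol</span>', {'Fiction': 0.97, 'Non-fiction': 0.03})
# (numbers illustrative).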

sample_text = [
    [
        "Poems on various subjects. Whereto is prefixed a short essay on the structure of English verse"
    ],
    [
        "Journal of a Residence in China and the neighbouring countries from 1830 to 1833. With an introductory essay by the Hon. and Rev. Baptist Wriothesley Noel. [With a map.]"
    ],
    ["The Adventures of Oliver Twist. [With plates.]"],
    ["['The Adventures of Sherlock Holmes', 'Single Works']"],
    [
        "['Coal, Iron, and Oil; or, the Practical American miner. A plain and popular work on our mines and mineral resources ... With numerous maps and engravings, etc']"
    ],
    [
        "Summer Travelling in Iceland; being the narrative of two journeys across the island ... With a chapter on Askja by E. Delmar Morgan ... Containing also a literal translation of three sagas. Maps, etc"
    ],
    [
        "Histoire de France au moyen âge, depuis Philippe-Auguste jusqu'à la fin du règne de Louis XI. 1223-1483. Troisieme édition"
    ],
    [
        "Two Centuries of Soho: its institutions, firms, and amusements. By the Clergy of St. Anne's, Soho, J. H. Cardwell ... H. B. Freeman ... G. C. Wilton ... assisted by other contributors, etc"
    ],
    ["A Christmas Carol"],
]
description = """
British Library Books genre detection model
"""
article = """
[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.5245175.svg)](https://doi.org/10.5281/zenodo.5245175)
# British Library Books genre detection demo
This demo allows you to play with a 'genre' detection model which has been trained to predict, from the title of a book, whether it is 'fiction' or 'non-fiction'.
The model was trained with the [fastai](https://docs.fast.ai/) library on training data drawn from digitised books at the British Library. These books are mainly from the 19th century.
The demo also shows you which parts of the input the model relies on most to make its prediction. You can hover over a word to see the attention score assigned to it. This gives you some sense of which words are important to the model in making a prediction.
The examples include titles from the BL books collection.
## Background
This model was developed as part of the [Living with Machines](https://livingwithmachines.ac.uk/) project. The process of training the model and working with the data is documented in a tutorial which will be released soon.
## Model description
This model is intended to predict, from the title of a book, whether it is 'fiction' or 'non-fiction'. This model was trained on data created from the [Digitised printed books (18th-19th Century)](https://www.bl.uk/collection-guides/digitised-printed-books) book collection.
This dataset is dominated by English-language books, though it includes books in several other languages in much smaller numbers. The model was originally developed to 'segment' this large dataset of books into different categories based on a 'crude' classification of genre, i.e. whether the title was `fiction` or `non-fiction`.
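If you want to experiment with the model outside this demo, a minimal sketch using fastai (assuming you have downloaded the exported `20210928-model.pkl` from this repository) looks like:

```python
from fastai.text.all import load_learner

learn = load_learner("20210928-model.pkl")
pred_class, pred_idx, probs = learn.predict("A Christmas Carol")
```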
## Training data
The model is trained on a particular collection of books digitised by the British Library. As a result, the model may do less well on titles that look different from this data.
In particular, the training data was mostly English and mostly from the 19th century. You can find more information about the model [here](https://doi.org/10.5281/zenodo.5245175).
## Model performance
The model's performance on a held-out test set is as follows:
```
              precision    recall  f1-score   support

     Fiction       0.91      0.88      0.90       296
 Non-fiction       0.94      0.95      0.95       554

    accuracy                           0.93       850
   macro avg       0.93      0.92      0.92       850
weighted avg       0.93      0.93      0.93       850
```
> Credits: This work was partly supported by [Living with Machines](https://livingwithmachines.ac.uk/). This project, funded by the UK Research and Innovation (UKRI) Strategic Priority Fund, is a multidisciplinary collaboration delivered by the Arts and Humanities Research Council (AHRC), with The Alan Turing Institute, the British Library and the Universities of Cambridge, East Anglia, Exeter, and Queen Mary University of London.
> Code for showing attention was adapted from Zach Mueller's (@TheZachMueller) [fastinference](https://muellerzr.github.io/fastinference/) library.
"""

gr_interface = gr.Interface(
    fn=predict,
    inputs=gr.inputs.Textbox(),
    outputs=[
        gr.outputs.HTML("Intrinsic attention"),
        gr.outputs.Label(num_top_classes=len(labels), label="Confidence"),
    ],
    title="British Library 19th Century Books Genre Classifier",
    description=description,
    article=article,
    examples=sample_text,
    allow_screenshot=True,
)
gr_interface.launch(inline=False, share=False)