Dean committed
Commit c6912f8 · 2 Parent(s): a7c7fdd 8d1f074

Merge commit '8d1f074f67512e839e8d290ade59fc8fe73f7c9c' into fix-mlflow


Incorporating latest version from Gagan

# Conflicts:
# dvc.yaml
# reports/evaluation_metrics.txt
# src/data/process_data.py
# src/models/evaluate_model.py
# src/models/model.py
# src/models/train_model.py

Makefile CHANGED
@@ -35,6 +35,7 @@ clean:
 ## Lint using flake8
 lint:
 	flake8 src
+	black src
 
 ## Upload Data to default DVC remote
 push:
dvc.yaml CHANGED
@@ -32,8 +32,6 @@ stages:
     outs:
       - models:
           persist: true
-      - reports/training_params.yml:
-          cache: false
     metrics:
       - reports/training_metrics.csv:
           cache: false
@@ -45,6 +43,15 @@ stages:
       - models
       - src/models/evaluate_model.py
     metrics:
-      - reports/metrics.csv:
+      - reports/evaluation_metrics.csv:
+          cache: false
+  visualize:
+    cmd: streamlit run src/visualization/visualize.py
+    deps:
+      - models
+      - src/visualization/visualize.py
+      - params.yml
+    metrics:
+      - reports/visualization_metrics.csv:
           cache: false
 
params.yml CHANGED
@@ -1,3 +1,4 @@
+name: summarsiation
 data: cnn_dailymail
 batch_size: 2
 num_workers: 2
@@ -8,5 +9,8 @@ epochs: 5
 source_dir: src
 model_dir: models
 metric: rouge
-split: 0.02
-use_gpu: True
+split: 0.001
+use_gpu: True
+visualise: True
+hf_username: gagan3012
+upload_to_hf: True
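
The new keys are read straight out of params.yml by the pipeline scripts. A minimal sketch of how they are consumed, mirroring the pattern in src/models/train_model.py; the printed message is illustrative only:

```python
import yaml

# Load the pipeline parameters, assuming params.yml sits in the repo root.
with open("params.yml") as f:
    params = yaml.safe_load(f)

# Keys added in this merge: name, visualise, hf_username, upload_to_hf.
if params["upload_to_hf"]:
    # Illustrative only; the real upload happens in Summarization.upload().
    print(f"Would push '{params['name']}' to hf.co/{params['hf_username']}")
```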
reports/{metrics.csv → evaluation_metrics.csv} RENAMED
File without changes
src/__init__.py CHANGED
@@ -0,0 +1,12 @@
+import os  # noqa: F401
+import sys  # noqa: F401
+
+from src.data.make_dataset import make_dataset  # noqa: F401
+from src.data.process_data import process_data  # noqa: F401
+from src.models.evaluate_model import evaluate_model  # noqa: F401
+from src.models.model import Summarization  # noqa: F401
+from src.models.predict_model import predict_model  # noqa: F401
+from src.models.train_model import train_model  # noqa: F401
+from src.visualization.visualize import visualize  # noqa: F401
+
+sys.path.append(os.path.dirname(os.path.realpath(__file__)))  # noqa: F401
src/data/make_dataset.py CHANGED
@@ -5,22 +5,21 @@ import os
 import pprint
 
 
-
-def make_dataset(dataset='cnn_dailymail', split='train'):
+def make_dataset(dataset="cnn_dailymail", split="train"):
     """make dataset for summarisation"""
-    if not os.path.exists('data/raw'):
-        os.makedirs('data/raw')
-    dataset = load_dataset(dataset, '3.0.0', split=split)
+    if not os.path.exists("data/raw"):
+        os.makedirs("data/raw")
+    dataset = load_dataset(dataset, "3.0.0", split=split)
     df = pd.DataFrame()
-    df['article'] = dataset['article']
-    df['highlights'] = dataset['highlights']
-    df.to_csv('data/raw/{}.csv'.format(split))
+    df["article"] = dataset["article"]
+    df["highlights"] = dataset["highlights"]
+    df.to_csv("data/raw/{}.csv".format(split))
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     with open("params.yml") as f:
         params = yaml.safe_load(f)
     pprint.pprint(params)
-    make_dataset(dataset=params['data'], split='train')
-    make_dataset(dataset=params['data'], split='test')
-    make_dataset(dataset=params['data'], split='validation')
+    make_dataset(dataset=params["data"], split="train")
+    make_dataset(dataset=params["data"], split="test")
+    make_dataset(dataset=params["data"], split="validation")
src/data/process_data.py CHANGED
@@ -3,18 +3,18 @@ import yaml
 import os
 
 
-def process_data(split='train'):
+def process_data(split="train"):
 
     with open("params.yml") as f:
         params = yaml.safe_load(f)
 
-    df = pd.read_csv('data/raw/{}.csv'.format(split))
-    df.columns = ['Unnamed: 0', 'input_text', 'output_text']
-    df = df.sample(frac=params['split'], replace=True, random_state=1)
-    df.to_csv('data/processed/{}.csv'.format(split))
+    df = pd.read_csv("data/raw/{}.csv".format(split))
+    df.columns = ["Unnamed: 0", "input_text", "output_text"]
+    df = df.sample(frac=params["split"], replace=True, random_state=1)
+    df.to_csv("data/processed/{}.csv".format(split))
 
 
-if __name__ == '__main__':
-    process_data(split='train')
-    process_data(split='test')
-    process_data(split='validation')
+if __name__ == "__main__":
+    process_data(split="train")
+    process_data(split="test")
+    process_data(split="validation")
src/models/__init__.py CHANGED
@@ -1 +1,4 @@
-from .model import Summarization
+from .model import Summarization  # noqa: F401
+from .train_model import train_model  # noqa: F401
+from .predict_model import predict_model  # noqa: F401
+from .evaluate_model import evaluate_model  # noqa: F401
src/models/evaluate_model.py CHANGED
@@ -13,14 +13,14 @@ def evaluate_model():
     with open("params.yml") as f:
         params = yaml.safe_load(f)
 
-    test_df = pd.read_csv('data/processed/test.csv')[:25]
+    test_df = pd.read_csv("data/processed/test.csv")[:25]
     model = Summarization()
-    model.load_model(model_type=params['model_type'], model_dir=params['model_dir'])
-    results = model.evaluate(test_df=test_df, metrics=params['metric'])
+    model.load_model(model_type=params["model_type"], model_dir=params["model_dir"])
+    results = model.evaluate(test_df=test_df, metrics=params["metric"])
 
-    with dagshub_logger(metrics_path='reports/metrics.csv', should_log_hparams=False) as logger:
+    with dagshub_logger(metrics_path='reports/evaluation_metrics.csv', should_log_hparams=False) as logger:
         logger.log_metrics(results)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     evaluate_model()
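
For context, dagshub_logger writes each metric as a name/value row, which is why evaluate() now hands it a flat dict of scalars (the nested ROUGE dict it returned before this merge would not serialize cleanly). A minimal sketch with illustrative values:

```python
from dagshub import dagshub_logger

# Illustrative scalar metrics; the real values come from model.evaluate().
results = {"Rouge_1 Mid F1": 0.31, "Rouge_2 Mid F1": 0.12}

with dagshub_logger(
    metrics_path="reports/evaluation_metrics.csv", should_log_hparams=False
) as logger:
    logger.log_metrics(results)  # expects metric-name -> scalar pairs
```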
src/models/model.py CHANGED
@@ -1,9 +1,17 @@
+import shutil
+from getpass import getpass
+from pathlib import Path
+
 import torch
 import pandas as pd
+from huggingface_hub import HfApi, Repository
 from transformers import (
     AdamW,
     T5ForConditionalGeneration,
-    T5TokenizerFast as T5Tokenizer, MT5Tokenizer, MT5ForConditionalGeneration, ByT5Tokenizer,
+    T5TokenizerFast as T5Tokenizer,
+    MT5Tokenizer,
+    MT5ForConditionalGeneration,
+    ByT5Tokenizer,
 )
 from torch.utils.data import Dataset, DataLoader
 import pytorch_lightning as pl
@@ -27,11 +35,11 @@ class DataModule(Dataset):
     """
 
     def __init__(
-            self,
-            data: pd.DataFrame,
-            tokenizer: T5Tokenizer,
-            source_max_token_len: int = 512,
-            target_max_token_len: int = 512,
+        self,
+        data: pd.DataFrame,
+        tokenizer: T5Tokenizer,
+        source_max_token_len: int = 512,
+        target_max_token_len: int = 512,
     ):
         """
         :param data:
@@ -71,9 +79,7 @@ class DataModule(Dataset):
         )
 
         labels = output_encoding["input_ids"]
-        labels[
-            labels == 0
-        ] = -100
+        labels[labels == 0] = -100
 
         return dict(
             keywords=data_row["input_text"],
@@ -87,15 +93,15 @@ class DataModule(Dataset):
 
 class PLDataModule(LightningDataModule):
     def __init__(
-            self,
-            train_df: pd.DataFrame,
-            test_df: pd.DataFrame,
-            tokenizer: T5Tokenizer,
-            source_max_token_len: int = 512,
-            target_max_token_len: int = 512,
-            batch_size: int = 4,
-            split: float = 0.1,
-            num_workers: int = 2
+        self,
+        train_df: pd.DataFrame,
+        test_df: pd.DataFrame,
+        tokenizer: T5Tokenizer,
+        source_max_token_len: int = 512,
+        target_max_token_len: int = 512,
+        batch_size: int = 4,
+        split: float = 0.1,
+        num_workers: int = 2,
     ):
         """
         :param data_df:
@@ -130,28 +136,45 @@ class PLDataModule(LightningDataModule):
         )
 
     def train_dataloader(self):
-        """ training dataloader """
+        """training dataloader"""
         return DataLoader(
-            self.train_dataset, batch_size=self.batch_size, shuffle=True, num_workers=self.num_workers
+            self.train_dataset,
+            batch_size=self.batch_size,
+            shuffle=True,
+            num_workers=self.num_workers,
         )
 
     def test_dataloader(self):
-        """ test dataloader """
+        """test dataloader"""
         return DataLoader(
-            self.test_dataset, batch_size=self.batch_size, shuffle=False, num_workers=self.num_workers
+            self.test_dataset,
+            batch_size=self.batch_size,
+            shuffle=False,
+            num_workers=self.num_workers,
        )
 
     def val_dataloader(self):
-        """ validation dataloader """
+        """validation dataloader"""
         return DataLoader(
-            self.test_dataset, batch_size=self.batch_size, shuffle=False, num_workers=self.num_workers
+            self.test_dataset,
+            batch_size=self.batch_size,
+            shuffle=False,
+            num_workers=self.num_workers,
         )
 
 
 class LightningModel(LightningModule):
-    """ PyTorch Lightning Model class"""
+    """PyTorch Lightning Model class"""
 
-    def __init__(self, tokenizer, model, learning_rate, adam_epsilon, weight_decay, output: str = "outputs"):
+    def __init__(
+        self,
+        tokenizer,
+        model,
+        learning_rate,
+        adam_epsilon,
+        weight_decay,
+        output: str = "outputs",
+    ):
         """
         initiates a PyTorch Lightning Model
         Args:
@@ -168,7 +191,7 @@ class LightningModel(LightningModule):
         self.weight_decay = weight_decay
 
     def forward(self, input_ids, attention_mask, decoder_attention_mask, labels=None):
-        """ forward step """
+        """forward step"""
         output = self.model(
             input_ids,
             attention_mask=attention_mask,
@@ -179,7 +202,7 @@ class LightningModel(LightningModule):
         return output.loss, output.logits
 
     def training_step(self, batch, batch_size):
-        """ training step """
+        """training step"""
         input_ids = batch["keywords_input_ids"]
         attention_mask = batch["keywords_attention_mask"]
         labels = batch["labels"]
@@ -195,7 +218,7 @@ class LightningModel(LightningModule):
         return loss
 
     def validation_step(self, batch, batch_size):
-        """ validation step """
+        """validation step"""
         input_ids = batch["keywords_input_ids"]
         attention_mask = batch["keywords_attention_mask"]
         labels = batch["labels"]
@@ -211,7 +234,7 @@ class LightningModel(LightningModule):
         return loss
 
     def test_step(self, batch, batch_size):
-        """ test step """
+        """test step"""
         input_ids = batch["keywords_input_ids"]
         attention_mask = batch["keywords_attention_mask"]
         labels = batch["labels"]
@@ -228,29 +251,41 @@ class LightningModel(LightningModule):
         return loss
 
     def configure_optimizers(self):
-        """ configure optimizers """
+        """configure optimizers"""
         model = self.model
         no_decay = ["bias", "LayerNorm.weight"]
         optimizer_grouped_parameters = [
             {
-                "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
+                "params": [
+                    p
+                    for n, p in model.named_parameters()
+                    if not any(nd in n for nd in no_decay)
+                ],
                 "weight_decay": self.weight_decay,
             },
             {
-                "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
+                "params": [
+                    p
+                    for n, p in model.named_parameters()
+                    if any(nd in n for nd in no_decay)
+                ],
                 "weight_decay": 0.0,
             },
         ]
-        optimizer = AdamW(optimizer_grouped_parameters, lr=self.learning_rate, eps=self.adam_epsilon)
+        optimizer = AdamW(
+            optimizer_grouped_parameters,
+            lr=self.learning_rate,
+            eps=self.adam_epsilon,
+        )
         self.opt = optimizer
         return [optimizer]
 
 
 class Summarization:
-    """ Custom Summarization class """
+    """Custom Summarization class"""
 
     def __init__(self) -> None:
-        """ initiates Summarization class """
+        """initiates Summarization class"""
         pass
 
     def from_pretrained(self, model_type="t5", model_name="t5-base") -> None:
@@ -277,20 +312,20 @@ class Summarization:
         )
 
     def train(
-            self,
-            train_df: pd.DataFrame,
-            eval_df: pd.DataFrame,
-            source_max_token_len: int = 512,
-            target_max_token_len: int = 512,
-            batch_size: int = 8,
-            max_epochs: int = 5,
-            use_gpu: bool = True,
-            outputdir: str = "models",
-            early_stopping_patience_epochs: int = 0,  # 0 to disable early stopping feature
-            learning_rate: float = 0.0001,
-            adam_epsilon: float = 0.01,
-            num_workers: int = 2,
-            weight_decay: float = 0.0001
+        self,
+        train_df: pd.DataFrame,
+        eval_df: pd.DataFrame,
+        source_max_token_len: int = 512,
+        target_max_token_len: int = 512,
+        batch_size: int = 8,
+        max_epochs: int = 5,
+        use_gpu: bool = True,
+        outputdir: str = "models",
+        early_stopping_patience_epochs: int = 0,  # 0 to disable early stopping feature
+        learning_rate: float = 0.0001,
+        adam_epsilon: float = 0.01,
+        num_workers: int = 2,
+        weight_decay: float = 0.0001,
     ):
         """
         trains T5/MT5 model on custom dataset
@@ -322,8 +357,12 @@ class Summarization:
         )
 
         self.T5Model = LightningModel(
-            tokenizer=self.tokenizer, model=self.model, output=outputdir,
-            learning_rate=learning_rate, adam_epsilon=adam_epsilon, weight_decay=weight_decay
+            tokenizer=self.tokenizer,
+            model=self.model,
+            output=outputdir,
+            learning_rate=learning_rate,
+            adam_epsilon=adam_epsilon,
+            weight_decay=weight_decay,
         )
 
         logger = DAGsHubLogger(metrics_path='reports/training_metrics.csv',
@@ -359,7 +398,7 @@ class Summarization:
         trainer.fit(self.T5Model, self.data_module)
 
     def load_model(
-            self, model_type: str = 't5', model_dir: str = "models", use_gpu: bool = False
+        self, model_type: str = "t5", model_dir: str = "models", use_gpu: bool = False
     ):
         """
         loads a checkpoint for inferencing/prediction
@@ -388,16 +427,15 @@ class Summarization:
             if torch.cuda.is_available():
                 self.device = torch.device("cuda")
             else:
-                raise Exception("exception ---> no gpu found. set use_gpu=False, to use CPU")
+                raise Exception(
+                    "exception ---> no gpu found. set use_gpu=False, to use CPU"
+                )
         else:
             self.device = torch.device("cpu")
 
         self.model = self.model.to(self.device)
 
-    def save_model(
-            self,
-            model_dir="models"
-    ):
+    def save_model(self, model_dir="models"):
         """
         Save model to dir
         :param model_dir:
@@ -408,19 +446,19 @@ class Summarization:
         self.model.save_pretrained(path)
 
     def predict(
-            self,
-            source_text: str,
-            max_length: int = 512,
-            num_return_sequences: int = 1,
-            num_beams: int = 2,
-            top_k: int = 50,
-            top_p: float = 0.95,
-            do_sample: bool = True,
-            repetition_penalty: float = 2.5,
-            length_penalty: float = 1.0,
-            early_stopping: bool = True,
-            skip_special_tokens: bool = True,
-            clean_up_tokenization_spaces: bool = True,
+        self,
+        source_text: str,
+        max_length: int = 512,
+        num_return_sequences: int = 1,
+        num_beams: int = 2,
+        top_k: int = 50,
+        top_p: float = 0.95,
+        do_sample: bool = True,
+        repetition_penalty: float = 2.5,
+        length_penalty: float = 1.0,
+        early_stopping: bool = True,
+        skip_special_tokens: bool = True,
+        clean_up_tokenization_spaces: bool = True,
     ):
         """
         generates prediction for T5/MT5 model
@@ -463,14 +501,10 @@ class Summarization:
         )
         return preds
 
-    def evaluate(
-            self,
-            test_df: pd.DataFrame,
-            metrics: str = "rouge"
-    ):
+    def evaluate(self, test_df: pd.DataFrame, metrics: str = "rouge"):
         metric = load_metric(metrics)
-        input_text = test_df['input_text']
-        references = test_df['output_text']
+        input_text = test_df["input_text"]
+        references = test_df["output_text"]
         references = references.to_list()
 
         predictions = [self.predict(x) for x in tqdm(input_text)]
@@ -478,49 +512,69 @@ class Summarization:
         results = metric.compute(predictions=predictions, references=references)
 
         output = {
-            'Rouge 1': {
-                'Rouge_1 Low Precision': results["rouge1"].low.precision,
-                'Rouge_1 Low recall': results["rouge1"].low.recall,
-                'Rouge_1 Low F1': results["rouge1"].low.fmeasure,
-                'Rouge_1 Mid Precision': results["rouge1"].mid.precision,
-                'Rouge_1 Mid recall': results["rouge1"].mid.recall,
-                'Rouge_1 Mid F1': results["rouge1"].mid.fmeasure,
-                'Rouge_1 High Precision': results["rouge1"].high.precision,
-                'Rouge_1 High recall': results["rouge1"].high.recall,
-                'Rouge_1 High F1': results["rouge1"].high.fmeasure,
-            },
-            'Rouge 2': {
-                'Rouge_2 Low Precision': results["rouge2"].low.precision,
-                'Rouge_2 Low recall': results["rouge2"].low.recall,
-                'Rouge_2 Low F1': results["rouge2"].low.fmeasure,
-                'Rouge_2 Mid Precision': results["rouge2"].mid.precision,
-                'Rouge_2 Mid recall': results["rouge2"].mid.recall,
-                'Rouge_2 Mid F1': results["rouge2"].mid.fmeasure,
-                'Rouge_2 High Precision': results["rouge2"].high.precision,
-                'Rouge_2 High recall': results["rouge2"].high.recall,
-                'Rouge_2 High F1': results["rouge2"].high.fmeasure,
-            },
-            'Rouge L': {
-                'Rouge_L Low Precision': results["rougeL"].low.precision,
-                'Rouge_L Low recall': results["rougeL"].low.recall,
-                'Rouge_L Low F1': results["rougeL"].low.fmeasure,
-                'Rouge_L Mid Precision': results["rougeL"].mid.precision,
-                'Rouge_L Mid recall': results["rougeL"].mid.recall,
-                'Rouge_L Mid F1': results["rougeL"].mid.fmeasure,
-                'Rouge_L High Precision': results["rougeL"].high.precision,
-                'Rouge_L High recall': results["rougeL"].high.recall,
-                'Rouge_L High F1': results["rougeL"].high.fmeasure,
-            },
-            'rougeLsum': {
-                'rougeLsum Low Precision': results["rougeLsum"].low.precision,
-                'rougeLsum Low recall': results["rougeLsum"].low.recall,
-                'rougeLsum Low F1': results["rougeLsum"].low.fmeasure,
-                'rougeLsum Mid Precision': results["rougeLsum"].mid.precision,
-                'rougeLsum Mid recall': results["rougeLsum"].mid.recall,
-                'rougeLsum Mid F1': results["rougeLsum"].mid.fmeasure,
-                'rougeLsum High Precision': results["rougeLsum"].high.precision,
-                'rougeLsum High recall': results["rougeLsum"].high.recall,
-                'rougeLsum High F1': results["rougeLsum"].high.fmeasure,
-            }
+            "Rouge_1 Low Precision": results["rouge1"].low.precision,
+            "Rouge_1 Low recall": results["rouge1"].low.recall,
+            "Rouge_1 Low F1": results["rouge1"].low.fmeasure,
+            "Rouge_1 Mid Precision": results["rouge1"].mid.precision,
+            "Rouge_1 Mid recall": results["rouge1"].mid.recall,
+            "Rouge_1 Mid F1": results["rouge1"].mid.fmeasure,
+            "Rouge_1 High Precision": results["rouge1"].high.precision,
+            "Rouge_1 High recall": results["rouge1"].high.recall,
+            "Rouge_1 High F1": results["rouge1"].high.fmeasure,
+            "Rouge_2 Low Precision": results["rouge2"].low.precision,
+            "Rouge_2 Low recall": results["rouge2"].low.recall,
+            "Rouge_2 Low F1": results["rouge2"].low.fmeasure,
+            "Rouge_2 Mid Precision": results["rouge2"].mid.precision,
+            "Rouge_2 Mid recall": results["rouge2"].mid.recall,
+            "Rouge_2 Mid F1": results["rouge2"].mid.fmeasure,
+            "Rouge_2 High Precision": results["rouge2"].high.precision,
+            "Rouge_2 High recall": results["rouge2"].high.recall,
+            "Rouge_2 High F1": results["rouge2"].high.fmeasure,
+            "Rouge_L Low Precision": results["rougeL"].low.precision,
+            "Rouge_L Low recall": results["rougeL"].low.recall,
+            "Rouge_L Low F1": results["rougeL"].low.fmeasure,
+            "Rouge_L Mid Precision": results["rougeL"].mid.precision,
+            "Rouge_L Mid recall": results["rougeL"].mid.recall,
+            "Rouge_L Mid F1": results["rougeL"].mid.fmeasure,
+            "Rouge_L High Precision": results["rougeL"].high.precision,
+            "Rouge_L High recall": results["rougeL"].high.recall,
+            "Rouge_L High F1": results["rougeL"].high.fmeasure,
+            "rougeLsum Low Precision": results["rougeLsum"].low.precision,
+            "rougeLsum Low recall": results["rougeLsum"].low.recall,
+            "rougeLsum Low F1": results["rougeLsum"].low.fmeasure,
+            "rougeLsum Mid Precision": results["rougeLsum"].mid.precision,
+            "rougeLsum Mid recall": results["rougeLsum"].mid.recall,
+            "rougeLsum Mid F1": results["rougeLsum"].mid.fmeasure,
+            "rougeLsum High Precision": results["rougeLsum"].high.precision,
+            "rougeLsum High recall": results["rougeLsum"].high.recall,
+            "rougeLsum High F1": results["rougeLsum"].high.fmeasure,
         }
         return output
+
+    def upload(self, hf_username, model_name):
+        hf_password = getpass("Enter your HuggingFace password")
+        if Path("./models").exists():
+            shutil.rmtree("./models")
+        token = HfApi().login(username=hf_username, password=hf_password)
+        del hf_password
+        model_url = HfApi().create_repo(token=token, name=model_name, exist_ok=True)
+        model_repo = Repository(
+            "./model",
+            clone_from=model_url,
+            use_auth_token=token,
+            git_email=f"{hf_username}@users.noreply.huggingface.co",
+            git_user=hf_username,
+        )
+
+        readme_txt = f"""
+        ---
+        Summarisation model {model_name}
+        """.strip()
+
+        (Path(model_repo.local_dir) / "README.md").write_text(readme_txt)
+        self.save_model()
+        commit_url = model_repo.push_to_hub()
+
+        print("Check out your model at:")
+        print(commit_url)
+        print(f"https://huggingface.co/{hf_username}/{model_name}")
src/models/predict_model.py CHANGED
@@ -11,14 +11,13 @@ def predict_model(text):
     with open("params.yml") as f:
         params = yaml.safe_load(f)
 
-
     model = Summarization()
-    model.load_model(model_type=params['model_type'], model_dir=params['model_dir'])
+    model.load_model(model_type=params["model_type"], model_dir=params["model_dir"])
     pre_summary = model.predict(text)
     return pre_summary
 
 
-if __name__ == '__main__':
-    text = pd.load_csv('data/processed/test.csv')['input_text'][0]
+if __name__ == "__main__":
+    text = pd.read_csv("data/processed/test.csv")["input_text"][0]
     pre_summary = predict_model(text)
     print(pre_summary)
src/models/train_model.py CHANGED
@@ -12,22 +12,32 @@ def train_model():
         params = yaml.safe_load(f)
 
     # Load the data
-    train_df = pd.read_csv('data/processed/train.csv')
-    eval_df = pd.read_csv('data/processed/validation.csv')
+    train_df = pd.read_csv("data/processed/train.csv")
+    eval_df = pd.read_csv("data/processed/validation.csv")
 
-    train_df = train_df.sample(frac=params['split'], replace=True, random_state=1)
-    eval_df = eval_df.sample(frac=params['split'], replace=True, random_state=1)
+    train_df = train_df.sample(frac=params["split"], replace=True, random_state=1)
+    eval_df = eval_df.sample(frac=params["split"], replace=True, random_state=1)
 
     model = Summarization()
-    model.from_pretrained(model_type=params['model_type'], model_name=params['model_name'])
+    model.from_pretrained(
+        model_type=params["model_type"], model_name=params["model_name"]
+    )
 
-    model.train(train_df=train_df, eval_df=eval_df,
-                batch_size=params['batch_size'], max_epochs=params['epochs'],
-                use_gpu=params['use_gpu'], learning_rate=float(params['learning_rate']),
-                num_workers=int(params['num_workers']))
+    model.train(
+        train_df=train_df,
+        eval_df=eval_df,
+        batch_size=params["batch_size"],
+        max_epochs=params["epochs"],
+        use_gpu=params["use_gpu"],
+        learning_rate=float(params["learning_rate"]),
+        num_workers=int(params["num_workers"]),
+    )
 
-    model.save_model(model_dir=params['model_dir'])
+    model.save_model(model_dir=params["model_dir"])
 
+    if params["upload_to_hf"]:
+        model.upload(hf_username=params["hf_username"], model_name=params["name"])
 
-if __name__ == '__main__':
+
+if __name__ == "__main__":
     train_model()
src/visualization/visualize.py CHANGED
@@ -0,0 +1,32 @@
+import streamlit as st
+import yaml
+
+from models import predict_model
+
+
+def visualize():
+    st.write("# Summarization UI")
+    st.markdown(
+        """
+    *For additional questions and inquiries, please contact **Gagan Bhatia** via [LinkedIn](
+    https://www.linkedin.com/in/gbhatia30/) or [Github](https://github.com/gagan3012).*
+    """
+    )
+
+    text = st.text_area("Enter text here")
+    if st.button("Generate Summary"):
+        with st.spinner("Connecting the Dots..."):
+            sumtext = predict_model(text=text)
+        st.write("# Generated Summary:")
+        st.write("{}".format(sumtext))
+        with open("reports/visualization_metrics.txt", "w") as file1:
+            file1.writelines(text)
+            file1.writelines(sumtext)
+
+
+if __name__ == "__main__":
+    with open("params.yml") as f:
+        params = yaml.safe_load(f)
+
+    if params["visualise"]:
+        visualize()
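
The new dvc.yaml visualize stage launches this script via streamlit run src/visualization/visualize.py. The same prediction path can also be exercised without the UI; a minimal sketch with an illustrative input:

```python
from models import predict_model  # same import the Streamlit app uses

# Illustrative input; any article-length string works.
sample = "The quick brown fox jumps over the lazy dog. " * 20
print(predict_model(text=sample))
```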