Spaces: Runtime error
Pipeline updates
Browse files
- .dvc/config +2 -0
- dvc.yaml +0 -1
- src/data/make_dataset.py +8 -6
- src/models/model.py +3 -3
.dvc/config
CHANGED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
['remote "origin"']
|
2 |
+
url = https://dagshub.com/gagan3012/summarization.dvc
|
dvc.yaml
CHANGED
@@ -22,7 +22,6 @@ stages:
|
|
22 |
process_data:
|
23 |
cmd: python src/data/make_dataset.py
|
24 |
deps:
|
25 |
-
- data/raw
|
26 |
- src/data/make_dataset.py
|
27 |
outs:
|
28 |
- data/processed:
|
|
|
22 |
process_data:
|
23 |
cmd: python src/data/make_dataset.py
|
24 |
deps:
|
|
|
25 |
- src/data/make_dataset.py
|
26 |
outs:
|
27 |
- data/processed:
|
src/data/make_dataset.py
CHANGED
@@ -2,14 +2,16 @@ from datasets import load_dataset
|
|
2 |
import pandas as pd
|
3 |
|
4 |
|
5 |
-
def make_dataset(dataset='cnn_dailymail', split='train'
|
6 |
"""make dataset for summarisation"""
|
7 |
-
dataset = load_dataset(dataset, split=split
|
8 |
df = pd.DataFrame()
|
9 |
-
df['input_text'] = dataset['
|
10 |
-
df['output_text'] = dataset['
|
11 |
-
|
12 |
|
13 |
|
14 |
if __name__ == '__main__':
|
15 |
-
make_dataset(dataset='cnn_dailymail', split='train'
|
|
|
|
|
|
2 |
import pandas as pd
|
3 |
|
4 |
|
5 |
+
def make_dataset(dataset='cnn_dailymail', split='train'):
|
6 |
"""make dataset for summarisation"""
|
7 |
+
dataset = load_dataset(dataset, '3.0.0', split=split)
|
8 |
df = pd.DataFrame()
|
9 |
+
df['input_text'] = dataset['article']
|
10 |
+
df['output_text'] = dataset['highlights']
|
11 |
+
df.to_csv('C:/Users/gbhat/Documents/GitHub/summarization/data/processed/{}.csv'.format(split, split))
|
12 |
|
13 |
|
14 |
if __name__ == '__main__':
|
15 |
+
make_dataset(dataset='cnn_dailymail', split='train')
|
16 |
+
make_dataset(dataset='cnn_dailymail', split='test')
|
17 |
+
make_dataset(dataset='cnn_dailymail', split='validation')
|
src/models/model.py
CHANGED
@@ -303,9 +303,9 @@ class Summarization:
|
|
303 |
tokenizer=self.tokenizer, model=self.model, output=outputdir
|
304 |
)
|
305 |
|
306 |
-
|
307 |
|
308 |
-
logger = DAGsHubLogger()
|
309 |
|
310 |
early_stop_callback = (
|
311 |
[
|
@@ -324,7 +324,7 @@ class Summarization:
|
|
324 |
gpus = 1 if use_gpu else 0
|
325 |
|
326 |
trainer = Trainer(
|
327 |
-
logger=logger,
|
328 |
callbacks=early_stop_callback,
|
329 |
max_epochs=max_epochs,
|
330 |
gpus=gpus,
|
|
|
303 |
tokenizer=self.tokenizer, model=self.model, output=outputdir
|
304 |
)
|
305 |
|
306 |
+
MLlogger = MLFlowLogger(experiment_name="Summarization",tracking_uri="https://dagshub.com/gagan3012/summarization.mlflow")
|
307 |
|
308 |
+
logger = DAGsHubLogger(metrics_path='reports/metrics.txt')
|
309 |
|
310 |
early_stop_callback = (
|
311 |
[
|
|
|
324 |
gpus = 1 if use_gpu else 0
|
325 |
|
326 |
trainer = Trainer(
|
327 |
+
logger=[logger,MLlogger],
|
328 |
callbacks=early_stop_callback,
|
329 |
max_epochs=max_epochs,
|
330 |
gpus=gpus,
|