gagan3012 committed on
Commit
f9cfbca
·
1 Parent(s): 9988244
Files changed (2) hide show
  1. dvc.yaml +19 -11
  2. src/data/process_data.py +9 -2
dvc.yaml CHANGED
@@ -1,4 +1,22 @@
1
  stages:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  train:
3
  cmd: python src/models/train_model.py
4
  deps:
@@ -20,14 +38,4 @@ stages:
20
  metrics:
21
  - reports/metrics.txt:
22
  cache: false
23
- process_data:
24
- cmd: python src/data/make_dataset.py
25
- deps:
26
- - src/data/make_dataset.py
27
- outs:
28
- - data/processed/test.csv:
29
- persist: true
30
- - data/processed/train.csv:
31
- persist: true
32
- - data/processed/validation.csv:
33
- persist: true
 
1
  stages:
2
+ create_data:
3
+ cmd: src/data/make_dataset.py
4
+ deps:
5
+ - src/data/make_dataset.py
6
+ outs:
7
+ - data/raw:
8
+ persist: true
9
+ process_data:
10
+ cmd: python src/data/process_data.py
11
+ deps:
12
+ - src/data/process_data.py
13
+ outs:
14
+ - data/processed/test.csv:
15
+ persist: true
16
+ - data/processed/train.csv:
17
+ persist: true
18
+ - data/processed/validation.csv:
19
+ persist: true
20
  train:
21
  cmd: python src/models/train_model.py
22
  deps:
 
38
  metrics:
39
  - reports/metrics.txt:
40
  cache: false
41
+
 
 
 
 
 
 
 
 
 
 
src/data/process_data.py CHANGED
@@ -1,8 +1,15 @@
1
  import pandas as pd
2
 
 
3
  def process_data(split='train'):
4
- df= pd.DataFrame()
5
  dataset = pd.load_csv('summarization/data/raw/{}.csv'.format(split))
6
  df['article'] = dataset['article']
7
  df['highlights'] = dataset['highlights']
8
- df.to_csv('summarization/data/processed/{}.csv'.format(split))
 
 
 
 
 
 
 
1
  import pandas as pd
2
 
3
+
4
  def process_data(split='train'):
5
+ df = pd.DataFrame()
6
  dataset = pd.load_csv('summarization/data/raw/{}.csv'.format(split))
7
  df['article'] = dataset['article']
8
  df['highlights'] = dataset['highlights']
9
+ df.to_csv('summarization/data/processed/{}.csv'.format(split))
10
+
11
+
12
+ if __name__ == '__name__':
13
+ process_data(split='train')
14
+ process_data(split='test')
15
+ process_data(split='validation')