sagawa committed on
Commit
3e767a6
·
1 Parent(s): ac2b5c6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +118 -118
app.py CHANGED
@@ -54,129 +54,129 @@ class CFG():
54
  num_workers=1
55
 
56
  if st.button('predict'):
57
- st.progress(0)
58
-
59
- device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
60
-
61
-
62
- def seed_everything(seed=42):
63
- random.seed(seed)
64
- os.environ['PYTHONHASHSEED'] = str(seed)
65
- np.random.seed(seed)
66
- torch.manual_seed(seed)
67
- torch.cuda.manual_seed(seed)
68
- torch.backends.cudnn.deterministic = True
69
- seed_everything(seed=CFG.seed)
70
-
71
- CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.model_name_or_path, return_tensors='pt')
72
-
73
- def prepare_input(cfg, text):
74
- inputs = cfg.tokenizer(text, add_special_tokens=True, max_length=CFG.max_len, padding='max_length', return_offsets_mapping=False, truncation=True, return_attention_mask=True)
75
- for k, v in inputs.items():
76
- inputs[k] = torch.tensor(v, dtype=torch.long)
77
-
78
- return inputs
79
-
80
- class TestDataset(Dataset):
81
- def __init__(self, cfg, df):
82
- self.cfg = cfg
83
- self.inputs = df['input'].values
84
-
85
- def __len__(self):
86
- return len(self.inputs)
87
 
88
- def __getitem__(self, item):
89
- inputs = prepare_input(self.cfg, self.inputs[item])
 
 
90
 
91
  return inputs
92
 
93
-
94
- class RegressionModel(nn.Module):
95
- def __init__(self, cfg, config_path=None, pretrained=False):
96
- super().__init__()
97
- self.cfg = cfg
98
- if config_path is None:
99
- self.config = AutoConfig.from_pretrained(cfg.pretrained_model_name_or_path, output_hidden_states=True)
100
- else:
101
- self.config = torch.load(config_path)
102
- if pretrained:
103
- if 't5' in cfg.model:
104
- self.model = T5EncoderModel.from_pretrained(CFG.pretrained_model_name_or_path)
 
 
 
 
 
 
 
 
105
  else:
106
- self.model = AutoModel.from_pretrained(CFG.pretrained_model_name_or_path)
107
- else:
108
- if 't5' in cfg.model:
109
- self.model = T5EncoderModel.from_pretrained('sagawa/ZINC-t5')
 
 
110
  else:
111
- self.model = AutoModel.from_config(self.config)
112
- self.model.resize_token_embeddings(len(cfg.tokenizer))
113
- self.fc_dropout1 = nn.Dropout(cfg.fc_dropout)
114
- self.fc1 = nn.Linear(self.config.hidden_size, self.config.hidden_size)
115
- self.fc_dropout2 = nn.Dropout(cfg.fc_dropout)
116
- self.fc2 = nn.Linear(self.config.hidden_size, 1)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
 
118
- def forward(self, inputs):
119
- outputs = self.model(**inputs)
120
- last_hidden_states = outputs[0]
121
- output = self.fc1(self.fc_dropout1(last_hidden_states)[:, 0, :].view(-1, self.config.hidden_size))
122
- output = self.fc2(self.fc_dropout2(output))
123
- return output
124
-
125
-
126
-
127
- def inference_fn(test_loader, model, device):
128
- preds = []
129
- model.eval()
130
- model.to(device)
131
- tk0 = enumerate(test_loader)
132
- for i, inputs in tk0:
133
- for k, v in inputs.items():
134
- inputs[k] = v.to(device)
135
- with torch.no_grad():
136
- y_preds = model(inputs)
137
- st.progress((i+1)*CFG.batch_size)
138
- preds.append(y_preds.to('cpu').numpy())
139
- predictions = np.concatenate(preds)
140
- return predictions
141
-
142
- model = RegressionModel(CFG, config_path=CFG.model_name_or_path + '/config.pth', pretrained=False)
143
- state = torch.load(CFG.model_name_or_path + '/ZINC-t5_best.pth', map_location=torch.device('cpu'))
144
- model.load_state_dict(state)
145
-
146
-
147
- if CFG.uploaded_file is not None:
148
- test_ds = pd.read_csv(CFG.uploaded_file)
149
 
150
- test_dataset = TestDataset(CFG, test_ds)
151
- test_loader = DataLoader(test_dataset,
152
- batch_size=CFG.batch_size,
153
- shuffle=False,
154
- num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
155
-
156
-
157
- prediction = inference_fn(test_loader, model, device)
158
-
159
- test_ds['prediction'] = prediction*100
160
- test_ds['prediction'] = test_ds['prediction'].clip(0, 100)
161
- csv = test_ds.to_csv(index=False)
162
- st.download_button(
163
- label="Download data as CSV",
164
- data=csv,
165
- file_name='output.csv',
166
- mime='text/csv'
167
- )
168
-
169
- else:
170
- CFG.batch_size=1
171
- test_ds = pd.DataFrame.from_dict({'input': CFG.data}, orient='index').T
172
- test_dataset = TestDataset(CFG, test_ds)
173
- test_loader = DataLoader(test_dataset,
174
- batch_size=CFG.batch_size,
175
- shuffle=False,
176
- num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
177
-
178
-
179
- prediction = inference_fn(test_loader, model, device)
180
- prediction = max(min(prediction[0][0]*100, 100), 0)
181
- st.text('yiled: '+ str(prediction))
182
-
 
54
  num_workers=1
55
 
56
# Entry point of the prediction flow: everything below runs only after the
# user presses the button, wrapped in a spinner so the UI shows progress.
if st.button('predict'):
    with st.spinner('Now processing. This process takes about 30 seconds per 10 reactions.'):

        # Run on GPU when torch can see one; fall back to CPU otherwise.
        device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
63
def seed_everything(seed=42):
    """Seed every RNG the app touches so runs are reproducible.

    Seeds Python's ``random``, NumPy, and torch (CPU and CUDA), fixes
    ``PYTHONHASHSEED``, and forces deterministic cuDNN kernels.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)  # no-op on CPU-only hosts
    torch.backends.cudnn.deterministic = True
70
# Seed all RNGs once up front so every prediction run is reproducible.
seed_everything(seed=CFG.seed)

# Load the tokenizer that matches the fine-tuned checkpoint and stash it on
# the shared config object so helpers below can reach it.
CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.model_name_or_path, return_tensors='pt')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
 
74
def prepare_input(cfg, text):
    """Tokenize one input string into fixed-length torch tensors.

    Args:
        cfg: config object providing ``tokenizer`` and ``max_len``.
        text: raw input string (a reaction in text form).

    Returns:
        dict of 1-D ``torch.long`` tensors (``input_ids``,
        ``attention_mask``, ...) padded/truncated to ``cfg.max_len``.
    """
    # Bug fix: the original read the global CFG.max_len while accepting a
    # `cfg` parameter; use the parameter consistently so the helper works
    # with any config object (identical behavior at the existing call site,
    # where cfg is CFG).
    inputs = cfg.tokenizer(
        text,
        add_special_tokens=True,
        max_length=cfg.max_len,
        padding='max_length',
        return_offsets_mapping=False,
        truncation=True,
        return_attention_mask=True,
    )
    # The tokenizer returns plain Python lists here; convert each field to a
    # long tensor so DataLoader batching stacks them directly.
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs
80
 
81
class TestDataset(Dataset):
    """Dataset over the ``input`` column of a DataFrame, tokenized lazily."""

    def __init__(self, cfg, df):
        self.cfg = cfg
        # Keep only the raw strings; tokenization happens per item so memory
        # stays proportional to a batch, not the whole file.
        self.inputs = df['input'].values

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, item):
        # Tokenize on access; returns a dict of fixed-length long tensors.
        return prepare_input(self.cfg, self.inputs[item])
93
+
94
+
95
class RegressionModel(nn.Module):
    """Transformer encoder plus a 2-layer MLP head predicting one scalar
    (reaction yield) from the encoder's first-token representation.
    """

    def __init__(self, cfg, config_path=None, pretrained=False):
        """
        Args:
            cfg: config object (``model``, ``pretrained_model_name_or_path``,
                ``tokenizer``, ``fc_dropout``).
            config_path: path to a transformers config saved via ``torch.save``;
                when None the config is fetched from the hub instead.
            pretrained: when True, load encoder weights from
                ``cfg.pretrained_model_name_or_path``.
        """
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.pretrained_model_name_or_path, output_hidden_states=True)
        else:
            # NOTE(review): torch.load unpickles arbitrary code — only load
            # config files from trusted sources.
            self.config = torch.load(config_path)
        if pretrained:
            if 't5' in cfg.model:
                # Consistency fix: use the `cfg` parameter instead of the
                # global CFG (they are the same object at the call site).
                self.model = T5EncoderModel.from_pretrained(cfg.pretrained_model_name_or_path)
            else:
                self.model = AutoModel.from_pretrained(cfg.pretrained_model_name_or_path)
        else:
            if 't5' in cfg.model:
                # NOTE(review): this branch still downloads pretrained
                # 'sagawa/ZINC-t5' weights despite pretrained=False —
                # presumably as base weights before the fine-tuned state dict
                # is loaded; confirm this is intentional.
                self.model = T5EncoderModel.from_pretrained('sagawa/ZINC-t5')
            else:
                self.model = AutoModel.from_config(self.config)
        # The vocabulary may have grown (added tokens), so resize embeddings
        # to match the tokenizer before loading a state dict.
        self.model.resize_token_embeddings(len(cfg.tokenizer))
        self.fc_dropout1 = nn.Dropout(cfg.fc_dropout)
        self.fc1 = nn.Linear(self.config.hidden_size, self.config.hidden_size)
        self.fc_dropout2 = nn.Dropout(cfg.fc_dropout)
        self.fc2 = nn.Linear(self.config.hidden_size, 1)

    def forward(self, inputs):
        """Map a tokenized batch (dict of tensors) to a (batch, 1) output."""
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        # Use the first token's hidden state as the sequence representation.
        output = self.fc1(self.fc_dropout1(last_hidden_states)[:, 0, :].view(-1, self.config.hidden_size))
        output = self.fc2(self.fc_dropout2(output))
        return output
125
+
126
+
127
+
128
def inference_fn(test_loader, model, device):
    """Run ``model`` over every batch of ``test_loader`` in eval mode and
    return all predictions as one numpy array.

    Args:
        test_loader: iterable yielding dict batches (name -> tensor).
        model: module whose forward takes the batch dict and returns a tensor.
        device: torch.device to run inference on.

    Returns:
        np.ndarray concatenating every batch's predictions along axis 0.
    """
    preds = []
    model.eval()
    model.to(device)
    # Cleanup: the batch index from the original `enumerate` was only used by
    # a removed progress bar; iterate the loader directly.
    for inputs in test_loader:
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        with torch.no_grad():
            y_preds = model(inputs)
        preds.append(y_preds.to('cpu').numpy())
    return np.concatenate(preds)
141
+
142
# Rebuild the model skeleton from the saved config, then load the fine-tuned
# weights on CPU so the app also starts on GPU-less hosts.
model = RegressionModel(CFG, config_path=CFG.model_name_or_path + '/config.pth', pretrained=False)
state = torch.load(CFG.model_name_or_path + '/ZINC-t5_best.pth', map_location=torch.device('cpu'))
model.load_state_dict(state)

if CFG.uploaded_file is not None:
    # Batch mode: the user uploaded a CSV with an 'input' column.
    test_ds = pd.read_csv(CFG.uploaded_file)

    test_dataset = TestDataset(CFG, test_ds)
    test_loader = DataLoader(test_dataset,
                             batch_size=CFG.batch_size,
                             shuffle=False,
                             num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    prediction = inference_fn(test_loader, model, device)

    # Model outputs are fractions; express as percent and clamp to [0, 100].
    test_ds['prediction'] = prediction * 100
    test_ds['prediction'] = test_ds['prediction'].clip(0, 100)
    csv = test_ds.to_csv(index=False)
    st.download_button(
        label="Download data as CSV",
        data=csv,
        file_name='output.csv',
        mime='text/csv'
    )

else:
    # Single-reaction mode: wrap the text-box input in a one-row DataFrame.
    CFG.batch_size = 1
    test_ds = pd.DataFrame.from_dict({'input': CFG.data}, orient='index').T
    test_dataset = TestDataset(CFG, test_ds)
    test_loader = DataLoader(test_dataset,
                             batch_size=CFG.batch_size,
                             shuffle=False,
                             num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    prediction = inference_fn(test_loader, model, device)
    prediction = max(min(prediction[0][0] * 100, 100), 0)
    # Bug fix: the user-facing label was misspelled 'yiled'.
    st.text('yield: ' + str(prediction))