sagawa committed
Commit da0c38f
1 Parent(s): 63be818

Update app.py

Files changed (1):
  1. app.py +117 -92
app.py CHANGED
@@ -29,13 +29,14 @@ disable_progress_bar()
 import streamlit as st
 
 st.title('predictyield-t5')
-st.markdown('### At this space, you can predict the yields of reactions from their inputs.')
-st.markdown('### The format of the string is like "REACTANT:{reactants of the reaction}REAGENT:{reagents, catalysts, or solvents of the reaction}PRODUCT:{products of the reaction}".')
-st.markdown('### If there are no reagents or catalysts, fill the blank with a space. And if there are multiple reactants, concatenate them with "."')
 display_text = 'input the reaction smiles (e.g. REACTANT:CC(C)n1ncnc1-c1cn2c(n1)-c1cnc(O)cc1OCC2.CCN(C(C)C)C(C)C.Cl.NC(=O)[C@@H]1C[C@H](F)CN1REAGENT: PRODUCT:O=C(NNC(=O)C(F)(F)F)C(F)(F)F)'
 
 
 class CFG():
     data = st.text_area(display_text)
     pretrained_model_name_or_path = 'sagawa/ZINC-t5'
     model = 't5'
@@ -46,103 +47,127 @@ class CFG():
     seed = 42
     num_workers=1
 
-device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-
-
-def seed_everything(seed=42):
-    random.seed(seed)
-    os.environ['PYTHONHASHSEED'] = str(seed)
-    np.random.seed(seed)
-    torch.manual_seed(seed)
-    torch.cuda.manual_seed(seed)
-    torch.backends.cudnn.deterministic = True
-seed_everything(seed=CFG.seed)
-
-CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.model_name_or_path, return_tensors='pt')
-
-def prepare_input(cfg, text):
-    inputs = cfg.tokenizer(text, add_special_tokens=True, max_length=CFG.max_len, padding='max_length', return_offsets_mapping=False, truncation=True, return_attention_mask=True)
-    for k, v in inputs.items():
-        inputs[k] = torch.tensor(v, dtype=torch.long)
 
-    return inputs
-
-class TestDataset(Dataset):
-    def __init__(self, cfg, df):
-        self.cfg = cfg
-        self.inputs = df['input'].values
-
-    def __len__(self):
-        return len(self.inputs)
 
-    def __getitem__(self, item):
-        inputs = prepare_input(self.cfg, self.inputs[item])
 
         return inputs
 
-
-class RegressionModel(nn.Module):
-    def __init__(self, cfg, config_path=None, pretrained=False):
-        super().__init__()
-        self.cfg = cfg
-        if config_path is None:
-            self.config = AutoConfig.from_pretrained(cfg.pretrained_model_name_or_path, output_hidden_states=True)
-        else:
-            self.config = torch.load(config_path)
-        if pretrained:
-            if 't5' in cfg.model:
-                self.model = T5EncoderModel.from_pretrained(CFG.pretrained_model_name_or_path)
             else:
-                self.model = AutoModel.from_pretrained(CFG.pretrained_model_name_or_path)
-        else:
-            if 't5' in cfg.model:
-                self.model = T5EncoderModel.from_pretrained('sagawa/ZINC-t5')
             else:
-                self.model = AutoModel.from_config(self.config)
-        self.model.resize_token_embeddings(len(cfg.tokenizer))
-        self.fc_dropout1 = nn.Dropout(cfg.fc_dropout)
-        self.fc1 = nn.Linear(self.config.hidden_size, self.config.hidden_size)
-        self.fc_dropout2 = nn.Dropout(cfg.fc_dropout)
-        self.fc2 = nn.Linear(self.config.hidden_size, 1)
 
-    def forward(self, inputs):
-        outputs = self.model(**inputs)
-        last_hidden_states = outputs[0]
-        output = self.fc1(self.fc_dropout1(last_hidden_states)[:, 0, :].view(-1, self.config.hidden_size))
-        output = self.fc2(self.fc_dropout2(output))
-        return output
 
-
 
-def inference_fn(test_loader, model, device):
-    preds = []
-    model.eval()
-    model.to(device)
-    tk0 = tqdm(test_loader, total=len(test_loader))
-    for inputs in tk0:
-        for k, v in inputs.items():
-            inputs[k] = v.to(device)
-        with torch.no_grad():
-            y_preds = model(inputs)
-        preds.append(y_preds.to('cpu').numpy())
-    predictions = np.concatenate(preds)
-    return predictions
-
-model = RegressionModel(CFG, config_path=CFG.model_name_or_path + '/config.pth', pretrained=False)
-state = torch.load(CFG.model_name_or_path + '/ZINC-t5_best.pth', map_location=torch.device('cpu'))
-model.load_state_dict(state)
-
-
-
-test_ds = pd.DataFrame.from_dict({'input': CFG.data}, orient='index').T
-test_dataset = TestDataset(CFG, test_ds)
-test_loader = DataLoader(test_dataset,
-                         batch_size=1,
-                         shuffle=False,
-                         num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
-
-
-prediction = inference_fn(test_loader, model, device)
-prediction = max(min(prediction[0][0]*100, 100), 0)
-st.text('yiled: '+ str(prediction))
 
 import streamlit as st
 
 st.title('predictyield-t5')
+st.markdown('##### At this space, you can predict the yields of reactions from their inputs.')
+st.markdown('##### The code expects input_data as a string or CSV file that contains an "input" column. The format of the string or contents of the column are like "REACTANT:{reactants of the reaction}REAGENT:{reagents, catalysts, or solvents of the reaction}PRODUCT:{products of the reaction}".')
+st.markdown('##### If there are no reagents or catalysts, fill the blank with a space. And if there are multiple reactants, concatenate them with "."')
 display_text = 'input the reaction smiles (e.g. REACTANT:CC(C)n1ncnc1-c1cn2c(n1)-c1cnc(O)cc1OCC2.CCN(C(C)C)C(C)C.Cl.NC(=O)[C@@H]1C[C@H](F)CN1REAGENT: PRODUCT:O=C(NNC(=O)C(F)(F)F)C(F)(F)F)'
 
 
 class CFG():
+    uploaded_file = st.file_uploader("Choose a CSV file")
     data = st.text_area(display_text)
     pretrained_model_name_or_path = 'sagawa/ZINC-t5'
     model = 't5'
 
     seed = 42
     num_workers=1
 
+if st.button('predict'):
 
+    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+
+    def seed_everything(seed=42):
+        random.seed(seed)
+        os.environ['PYTHONHASHSEED'] = str(seed)
+        np.random.seed(seed)
+        torch.manual_seed(seed)
+        torch.cuda.manual_seed(seed)
+        torch.backends.cudnn.deterministic = True
+    seed_everything(seed=CFG.seed)
 
+    CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.model_name_or_path, return_tensors='pt')
+
+    def prepare_input(cfg, text):
+        inputs = cfg.tokenizer(text, add_special_tokens=True, max_length=CFG.max_len, padding='max_length', return_offsets_mapping=False, truncation=True, return_attention_mask=True)
+        for k, v in inputs.items():
+            inputs[k] = torch.tensor(v, dtype=torch.long)
 
         return inputs
 
+    class TestDataset(Dataset):
+        def __init__(self, cfg, df):
+            self.cfg = cfg
+            self.inputs = df['input'].values
+
+        def __len__(self):
+            return len(self.inputs)
+
+        def __getitem__(self, item):
+            inputs = prepare_input(self.cfg, self.inputs[item])
+
+            return inputs
+
+
+    class RegressionModel(nn.Module):
+        def __init__(self, cfg, config_path=None, pretrained=False):
+            super().__init__()
+            self.cfg = cfg
+            if config_path is None:
+                self.config = AutoConfig.from_pretrained(cfg.pretrained_model_name_or_path, output_hidden_states=True)
             else:
+                self.config = torch.load(config_path)
+            if pretrained:
+                if 't5' in cfg.model:
+                    self.model = T5EncoderModel.from_pretrained(CFG.pretrained_model_name_or_path)
+                else:
+                    self.model = AutoModel.from_pretrained(CFG.pretrained_model_name_or_path)
             else:
+                if 't5' in cfg.model:
+                    self.model = T5EncoderModel.from_pretrained('sagawa/ZINC-t5')
+                else:
+                    self.model = AutoModel.from_config(self.config)
+            self.model.resize_token_embeddings(len(cfg.tokenizer))
+            self.fc_dropout1 = nn.Dropout(cfg.fc_dropout)
+            self.fc1 = nn.Linear(self.config.hidden_size, self.config.hidden_size)
+            self.fc_dropout2 = nn.Dropout(cfg.fc_dropout)
+            self.fc2 = nn.Linear(self.config.hidden_size, 1)
+
+        def forward(self, inputs):
+            outputs = self.model(**inputs)
+            last_hidden_states = outputs[0]
+            output = self.fc1(self.fc_dropout1(last_hidden_states)[:, 0, :].view(-1, self.config.hidden_size))
+            output = self.fc2(self.fc_dropout2(output))
+            return output
 
 
+
+    def inference_fn(test_loader, model, device):
+        preds = []
+        model.eval()
+        model.to(device)
+        tk0 = tqdm(test_loader, total=len(test_loader))
+        for inputs in tk0:
+            for k, v in inputs.items():
+                inputs[k] = v.to(device)
+            with torch.no_grad():
+                y_preds = model(inputs)
+            preds.append(y_preds.to('cpu').numpy())
+        predictions = np.concatenate(preds)
+        return predictions
 
+    model = RegressionModel(CFG, config_path=CFG.model_name_or_path + '/config.pth', pretrained=False)
+    state = torch.load(CFG.model_name_or_path + '/ZINC-t5_best.pth', map_location=torch.device('cpu'))
+    model.load_state_dict(state)
+
+
+    if CFG.uploaded_file is not None:
+        test_ds = pd.read_csv(CFG.uploaded_file)
+
+        test_dataset = TestDataset(CFG, test_ds)
+        test_loader = DataLoader(test_dataset,
+                                 batch_size=CFG.batch_size,
+                                 shuffle=False,
+                                 num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
+
+
+        prediction = inference_fn(test_loader, model, device)
+
+        test_ds['prediction'] = prediction*100
+        test_ds['prediction'] = test_ds['prediction'].clip(0, 100)
+        csv = test_ds.to_csv(index=False)
+        st.download_button(
+            label="Download data as CSV",
+            data=csv,
+            file_name='output.csv',
+            mime='text/csv'
+        )
+
+    else:
+        test_ds = pd.DataFrame.from_dict({'input': CFG.data}, orient='index').T
+        test_dataset = TestDataset(CFG, test_ds)
+        test_loader = DataLoader(test_dataset,
+                                 batch_size=1,
+                                 shuffle=False,
+                                 num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
+
+
+        prediction = inference_fn(test_loader, model, device)
+        prediction = max(min(prediction[0][0]*100, 100), 0)
+        st.text('yiled: '+ str(prediction))
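
Note (not part of the commit): a minimal sketch of how the REACTANT:/REAGENT:/PRODUCT: string described in the updated instructions can be assembled. The helper name build_input and the SMILES values are illustrative, not from app.py.

# Illustrative only; build_input is a hypothetical helper, not defined in app.py.
def build_input(reactants, reagents, products):
    # Multiple species are joined with "."; an empty reagent list becomes a
    # single space, as the app's instructions ask.
    reagent_str = '.'.join(reagents) if reagents else ' '
    return 'REACTANT:' + '.'.join(reactants) + 'REAGENT:' + reagent_str + 'PRODUCT:' + '.'.join(products)

print(build_input(['CCO', 'CC(=O)O'], [], ['CCOC(C)=O']))
# -> REACTANT:CCO.CC(=O)OREAGENT: PRODUCT:CCOC(C)=O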
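
Likewise a sketch, assuming pandas, of preparing the CSV that the new upload path reads with pd.read_csv; the only requirement visible in the diff is an 'input' column. The file name input.csv is arbitrary, and the second row reuses the example string from display_text.

# Illustrative only; any file name works, the app just needs an 'input' column.
import pandas as pd

rows = [
    'REACTANT:CCO.CC(=O)OREAGENT: PRODUCT:CCOC(C)=O',
    'REACTANT:CC(C)n1ncnc1-c1cn2c(n1)-c1cnc(O)cc1OCC2.CCN(C(C)C)C(C)C.Cl.NC(=O)[C@@H]1C[C@H](F)CN1REAGENT: PRODUCT:O=C(NNC(=O)C(F)(F)F)C(F)(F)F',
]
pd.DataFrame({'input': rows}).to_csv('input.csv', index=False)
# The app returns output.csv with an added 'prediction' column, clipped to the 0-100 range.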