sagawa committed on
Commit
bbe3baf
1 Parent(s): cb7897c

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +147 -0
app.py ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import gc
3
+ import random
4
+ import itertools
5
+ import warnings
6
+ import logging
7
+ warnings.filterwarnings('ignore')
8
+ logging.disable(logging.WARNING)
9
+ import numpy as np
10
+ import pandas as pd
11
+ from tqdm.auto import tqdm
12
+ import tokenizers
13
+ import transformers
14
+ from transformers import AutoTokenizer, AutoConfig, AutoModel, T5EncoderModel, get_linear_schedule_with_warmup
15
+ import datasets
16
+ from datasets import load_dataset, load_metric
17
+ import sentencepiece
18
+ import argparse
19
+ import torch
20
+ from torch.utils.data import Dataset, DataLoader
21
+ import torch.nn.functional as F
22
+ import torch.nn as nn
23
+ import pickle
24
+ import time
25
+ from sklearn.preprocessing import MinMaxScaler
26
+ from datasets.utils.logging import disable_progress_bar
27
+ from sklearn.metrics import mean_squared_error, r2_score
28
+ disable_progress_bar()
29
+ import streamlit as st
30
+
31
# Page header: the app title followed by usage instructions, each rendered
# as a level-3 markdown heading.
st.title('predictyield-t5')
st.markdown(
    '### At this space, you can predict the yields of reactions '
    'from their inputs.'
)
st.markdown(
    '### The format of the string is like '
    '"REACTANT:{reactants of the reaction}'
    'REAGENT:{reagents, catalysts, or solvents of the reaction}'
    'PRODUCT:{products of the reaction}".'
)
st.markdown(
    '### If there are no reagents or catalysts, fill the blank with a space. '
    'And if there are multiple reactants, concatenate them with "."'
)

# Label for the input text area; it embeds one worked example of the
# expected REACTANT/REAGENT/PRODUCT string.
display_text = (
    'input the reaction smiles (e.g. '
    'REACTANT:CC(C)n1ncnc1-c1cn2c(n1)-c1cnc(O)cc1OCC2.CCN(C(C)C)C(C)C.Cl.NC(=O)[C@@H]1C[C@H](F)CN1'
    'REAGENT: '
    'PRODUCT:O=C(NNC(=O)C(F)(F)F)C(F)(F)F'
)
36
+
37
+
38
class CFG():
    """Static configuration for the yield-prediction app.

    Every value is a class attribute, so the class body (including the
    Streamlit text-area widget that captures the user's input) is evaluated
    once when the class statement runs.
    """
    # Raw reaction string typed by the user into the Streamlit text area.
    data = st.text_area(display_text)
    # Hub checkpoint used for the model config and the T5 encoder weights.
    pretrained_model_name_or_path = 'sagawa/ZINC-t5'
    # Model family tag.
    model = 't5'
    # Local directory holding the tokenizer files, config.pth and the
    # fine-tuned weights.
    model_name_or_path = './'
    # Inputs are padded/truncated to this many tokens.
    max_len = 512
    batch_size = 5
    # Dropout probability used by the regression head.
    fc_dropout = 0.1
    seed = 42
    # Worker processes for the DataLoader; 0 loads data in the main process.
    # The inference DataLoader reads this attribute, so it must exist.
    num_workers = 0
47
+
48
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
49
+
50
+
51
def seed_everything(seed=42):
    """Seed every random number generator the app relies on.

    Seeds Python's ``random``, NumPy, and PyTorch (CPU and current CUDA
    device), exports ``PYTHONHASHSEED``, and switches cuDNN to deterministic
    mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)
    torch.backends.cudnn.deterministic = True
58
# Fix all RNG state up front using the configured seed.
seed_everything(CFG.seed)

# Load the tokenizer stored alongside the app and attach it to the config
# object so that downstream helpers can reach it through `cfg.tokenizer`.
setattr(
    CFG,
    'tokenizer',
    AutoTokenizer.from_pretrained(CFG.model_name_or_path, return_tensors='pt'),
)
61
+
62
def prepare_input(cfg, text):
    """Tokenize one reaction string into fixed-length long tensors.

    Args:
        cfg: Configuration object providing ``tokenizer`` and ``max_len``.
        text: The reaction string to encode.

    Returns:
        The tokenizer's output mapping with every value converted to a
        ``torch.long`` tensor, padded/truncated to ``cfg.max_len``.
    """
    # Read max_len from the cfg argument (not the module-level CFG) so the
    # function honours whichever configuration the caller passes in.
    inputs = cfg.tokenizer(
        text,
        add_special_tokens=True,
        max_length=cfg.max_len,
        padding='max_length',
        return_offsets_mapping=False,
        truncation=True,
        return_attention_mask=True,
    )
    # Convert in place so the tokenizer's mapping type is what gets returned.
    for key, value in inputs.items():
        inputs[key] = torch.tensor(value, dtype=torch.long)

    return inputs
68
+
69
class TestDataset(Dataset):
    """Dataset that tokenizes the ``'input'`` column of a DataFrame on demand."""

    def __init__(self, cfg, df):
        """Keep the config and the raw values of ``df['input']``."""
        self.cfg, self.inputs = cfg, df['input'].values

    def __len__(self):
        """Return the number of stored input strings."""
        return len(self.inputs)

    def __getitem__(self, item):
        """Return the tokenized tensors for the input at position ``item``."""
        return prepare_input(self.cfg, self.inputs[item])
81
+
82
+
83
class RegressionModel(nn.Module):
    """Transformer encoder topped with a two-layer regression head.

    The head takes the encoder's hidden state at sequence position 0 and maps
    it through ``Linear(hidden, hidden)`` and ``Linear(hidden, 1)``, each
    preceded by dropout, producing one scalar per input sequence.
    """

    def __init__(self, cfg, config_path=None, pretrained=False):
        """Build the encoder and the regression head.

        Args:
            cfg: Configuration object. Reads ``pretrained_model_name_or_path``,
                ``model_name_or_path``, ``tokenizer``, ``fc_dropout`` and,
                when present, ``model``.
            config_path: Optional path to a model config saved with
                ``torch.save``. When ``None`` the config is fetched from
                ``cfg.pretrained_model_name_or_path``.
            pretrained: When true, load encoder weights from
                ``cfg.pretrained_model_name_or_path``.
        """
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.pretrained_model_name_or_path, output_hidden_states=True)
        else:
            # NOTE: torch.load unpickles the file, so only point this at a
            # config file from a trusted source.
            self.config = torch.load(config_path)

        if pretrained:
            # Use the cfg argument rather than the module-level CFG so the
            # model honours whichever configuration it was given.
            if 't5' in cfg.pretrained_model_name_or_path:
                self.model = T5EncoderModel.from_pretrained(cfg.pretrained_model_name_or_path)
            else:
                self.model = AutoModel.from_pretrained(cfg.pretrained_model_name_or_path)
        else:
            # A local directory such as './' carries no 't5' marker in its
            # path, so also consult the explicit model-family tag. forward()
            # feeds encoder-only inputs, which is what T5EncoderModel accepts.
            # NOTE(review): assumes cfg.model == 't5' is meant to select the
            # T5 encoder here - confirm against the training script.
            uses_t5 = (
                't5' in cfg.model_name_or_path
                or 't5' in str(getattr(cfg, 'model', ''))
            )
            if uses_t5:
                # NOTE(review): checkpoint name is hard-coded; it matches the
                # visible CFG.pretrained_model_name_or_path.
                self.model = T5EncoderModel.from_pretrained('sagawa/ZINC-t5')
            else:
                self.model = AutoModel.from_config(self.config)

        # Keep the embedding matrix in sync with the tokenizer's vocabulary.
        self.model.resize_token_embeddings(len(cfg.tokenizer))
        self.fc_dropout1 = nn.Dropout(cfg.fc_dropout)
        self.fc1 = nn.Linear(self.config.hidden_size, self.config.hidden_size)
        self.fc_dropout2 = nn.Dropout(cfg.fc_dropout)
        self.fc2 = nn.Linear(self.config.hidden_size, 1)

    def forward(self, inputs):
        """Return one regression value per sequence.

        Args:
            inputs: Mapping of keyword arguments forwarded to the encoder.

        Returns:
            Tensor with a trailing dimension of size 1 (one row per sequence).
        """
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        # Dropout is applied to the full hidden-state tensor before slicing
        # out position 0, matching the order used by the original head.
        first_token = self.fc_dropout1(last_hidden_states)[:, 0, :].view(-1, self.config.hidden_size)
        output = self.fc1(first_token)
        output = self.fc2(self.fc_dropout2(output))
        return output
113
+
114
+
115
+
116
def inference_fn(test_loader, model, device):
    """Run the model over every batch and collect its outputs.

    Args:
        test_loader: Iterable of batches, each a mapping of tensors.
        model: Module called with the batch mapping as its single argument.
        device: Device the model and each batch tensor are moved to.

    Returns:
        A NumPy array holding the per-batch outputs concatenated along the
        first axis.
    """
    model.eval()
    model.to(device)
    batch_outputs = []
    with torch.no_grad():
        for batch in tqdm(test_loader, total=len(test_loader)):
            on_device = {name: tensor.to(device) for name, tensor in batch.items()}
            batch_outputs.append(model(on_device).to('cpu').numpy())
    return np.concatenate(batch_outputs)
129
+
130
# Rebuild the network from the locally stored config, then restore the
# fine-tuned weights on CPU; inference_fn moves the model to `device` later.
model = RegressionModel(
    CFG,
    config_path=os.path.join(CFG.model_name_or_path, 'config.pth'),
    pretrained=False,
)
state = torch.load(
    os.path.join(CFG.model_name_or_path, 'ZINC-t5_best.pth'),
    map_location=torch.device('cpu'),
)
model.load_state_dict(state)

# Wrap the single user-supplied reaction string in a one-row DataFrame with
# an 'input' column, which is the shape TestDataset expects.
test_ds = pd.DataFrame.from_dict({'input': CFG.data}, orient='index').T
test_dataset = TestDataset(CFG, test_ds)
test_loader = DataLoader(
    test_dataset,
    batch_size=1,
    shuffle=False,
    # Fall back to in-process loading when CFG does not define num_workers,
    # instead of failing with AttributeError.
    num_workers=getattr(CFG, 'num_workers', 0),
    pin_memory=True,
    drop_last=False,
)

prediction = inference_fn(test_loader, model, device)
# Scale the first (only) output by 100 and clamp it to the 0-100 range.
prediction = max(min(prediction[0][0] * 100, 100), 0)
st.text('yield: ' + str(prediction))
147
+