sagawa committed on
Commit
b15be69
1 Parent(s): bd5de6d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +93 -92
app.py CHANGED
@@ -40,32 +40,84 @@ class CFG():
40
  seed = 42
41
 
42
  if st.button('predict'):
43
-
44
- device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
45
-
46
- def seed_everything(seed=42):
47
- random.seed(seed)
48
- os.environ['PYTHONHASHSEED'] = str(seed)
49
- np.random.seed(seed)
50
- torch.manual_seed(seed)
51
- torch.cuda.manual_seed(seed)
52
- torch.backends.cudnn.deterministic = True
53
- seed_everything(seed=CFG.seed)
54
-
55
-
56
- tokenizer = AutoTokenizer.from_pretrained(CFG.model_name_or_path, return_tensors='pt')
57
 
58
- if CFG.model == 't5':
59
- model = AutoModelForSeq2SeqLM.from_pretrained(CFG.model_name_or_path).to(device)
60
- elif CFG.model == 'deberta':
61
- model = EncoderDecoderModel.from_pretrained(CFG.model_name_or_path).to(device)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
 
64
- if CFG.uploaded_file is not None:
65
- input_data = pd.read_csv(CFG.uploaded_file)
66
- outputs = []
67
- for idx, row in input_data.iterrows():
68
- input_compound = row['input']
 
 
 
 
 
 
 
 
 
 
 
 
69
  min_length = min(input_compound.find('CATALYST') - input_compound.find(':') - 10, 0)
70
  inp = tokenizer(input_compound, return_tensors='pt').to(device)
71
  output = model.generate(**inp, min_length=min_length, max_length=min_length+50, num_beams=CFG.num_beams, num_return_sequences=CFG.num_return_sequences, return_dict_in_generate=True, output_scores=True)
@@ -83,8 +135,7 @@ if st.button('predict'):
83
  scores.append(None)
84
  output += scores
85
  output = [input_compound] + output
86
- outputs.append(output)
87
-
88
  else:
89
  output = [tokenizer.decode(output['sequences'][0], skip_special_tokens=True).replace('. ', '.').rstrip('.')]
90
  mol = Chem.MolFromSmiles(output[0])
@@ -92,74 +143,24 @@ if st.button('predict'):
92
  output.append(output[0])
93
  else:
94
  output.append(None)
95
- output = [input_compound] + output
96
- outputs.append(output)
97
 
98
- if CFG.num_beams > 1:
99
- output_df = pd.DataFrame(outputs, columns=['input'] + [f'{i}th' for i in range(CFG.num_beams)] + ['valid compound'] + [f'{i}th score' for i in range(CFG.num_beams)] + ['valid compound score'])
100
- else:
101
- output_df = pd.DataFrame(outputs, columns=['input', '0th', 'valid compound'])
102
-
103
-
104
- @st.cache
105
- def convert_df(df):
106
- # IMPORTANT: Cache the conversion to prevent computation on every rerun
107
- return df.to_csv(index=False)
108
-
109
- csv = convert_df(output_df)
110
 
111
- st.download_button(
112
- label="Download data as CSV",
113
- data=csv,
114
- file_name='output.csv',
115
- mime='text/csv',
116
- )
117
-
118
- else:
119
- input_compound = CFG.input_data
120
- min_length = min(input_compound.find('CATALYST') - input_compound.find(':') - 10, 0)
121
- inp = tokenizer(input_compound, return_tensors='pt').to(device)
122
- output = model.generate(**inp, min_length=min_length, max_length=min_length+50, num_beams=CFG.num_beams, num_return_sequences=CFG.num_return_sequences, return_dict_in_generate=True, output_scores=True)
123
- if CFG.num_beams > 1:
124
- scores = output['sequences_scores'].tolist()
125
- output = [tokenizer.decode(i, skip_special_tokens=True).replace('. ', '.').rstrip('.') for i in output['sequences']]
126
- for ith, out in enumerate(output):
127
- mol = Chem.MolFromSmiles(out.rstrip('.'))
128
- if type(mol) == rdkit.Chem.rdchem.Mol:
129
- output.append(out.rstrip('.'))
130
- scores.append(scores[ith])
131
- break
132
- if type(mol) == None:
133
- output.append(None)
134
- scores.append(None)
135
- output += scores
136
- output = [input_compound] + output
137
-
138
- else:
139
- output = [tokenizer.decode(output['sequences'][0], skip_special_tokens=True).replace('. ', '.').rstrip('.')]
140
- mol = Chem.MolFromSmiles(output[0])
141
- if type(mol) == rdkit.Chem.rdchem.Mol:
142
- output.append(output[0])
143
  else:
144
- output.append(None)
145
-
146
 
147
- if CFG.num_beams > 1:
148
- output_df = pd.DataFrame(np.array(output).reshape(1, -1), columns=['input'] + [f'{i}th' for i in range(CFG.num_beams)] + ['valid compound'] + [f'{i}th score' for i in range(CFG.num_beams)] + ['valid compound score'])
149
- else:
150
- output_df = pd.DataFrame(np.array([input_compound]+output).reshape(1, -1), columns=['input', '0th', 'valid compound'])
151
- st.table(output_df)
152
-
153
- @st.cache
154
- def convert_df(df):
155
- # IMPORTANT: Cache the conversion to prevent computation on every rerun
156
- return df.to_csv(index=False)
157
-
158
- csv = convert_df(output_df)
159
-
160
- st.download_button(
161
- label="Download data as CSV",
162
- data=csv,
163
- file_name='output.csv',
164
- mime='text/csv',
165
- )
 
40
  seed = 42
41
 
42
  if st.button('predict'):
43
+ with st.spinner('Now processing. If num beams=5, this process takes about 15 seconds per reaction.'):
44
+
45
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
46
+
47
+ def seed_everything(seed=42):
48
+ random.seed(seed)
49
+ os.environ['PYTHONHASHSEED'] = str(seed)
50
+ np.random.seed(seed)
51
+ torch.manual_seed(seed)
52
+ torch.cuda.manual_seed(seed)
53
+ torch.backends.cudnn.deterministic = True
54
+ seed_everything(seed=CFG.seed)
 
 
55
 
56
+
57
+ tokenizer = AutoTokenizer.from_pretrained(CFG.model_name_or_path, return_tensors='pt')
58
+
59
+ if CFG.model == 't5':
60
+ model = AutoModelForSeq2SeqLM.from_pretrained(CFG.model_name_or_path).to(device)
61
+ elif CFG.model == 'deberta':
62
+ model = EncoderDecoderModel.from_pretrained(CFG.model_name_or_path).to(device)
63
+
64
+
65
+ if CFG.uploaded_file is not None:
66
+ input_data = pd.read_csv(CFG.uploaded_file)
67
+ outputs = []
68
+ for idx, row in input_data.iterrows():
69
+ input_compound = row['input']
70
+ min_length = min(input_compound.find('CATALYST') - input_compound.find(':') - 10, 0)
71
+ inp = tokenizer(input_compound, return_tensors='pt').to(device)
72
+ output = model.generate(**inp, min_length=min_length, max_length=min_length+50, num_beams=CFG.num_beams, num_return_sequences=CFG.num_return_sequences, return_dict_in_generate=True, output_scores=True)
73
+ if CFG.num_beams > 1:
74
+ scores = output['sequences_scores'].tolist()
75
+ output = [tokenizer.decode(i, skip_special_tokens=True).replace('. ', '.').rstrip('.') for i in output['sequences']]
76
+ for ith, out in enumerate(output):
77
+ mol = Chem.MolFromSmiles(out.rstrip('.'))
78
+ if type(mol) == rdkit.Chem.rdchem.Mol:
79
+ output.append(out.rstrip('.'))
80
+ scores.append(scores[ith])
81
+ break
82
+ if type(mol) == None:
83
+ output.append(None)
84
+ scores.append(None)
85
+ output += scores
86
+ output = [input_compound] + output
87
+ outputs.append(output)
88
 
89
+ else:
90
+ output = [tokenizer.decode(output['sequences'][0], skip_special_tokens=True).replace('. ', '.').rstrip('.')]
91
+ mol = Chem.MolFromSmiles(output[0])
92
+ if type(mol) == rdkit.Chem.rdchem.Mol:
93
+ output.append(output[0])
94
+ else:
95
+ output.append(None)
96
+ output = [input_compound] + output
97
+ outputs.append(output)
98
+
99
+ if CFG.num_beams > 1:
100
+ output_df = pd.DataFrame(outputs, columns=['input'] + [f'{i}th' for i in range(CFG.num_beams)] + ['valid compound'] + [f'{i}th score' for i in range(CFG.num_beams)] + ['valid compound score'])
101
+ else:
102
+ output_df = pd.DataFrame(outputs, columns=['input', '0th', 'valid compound'])
103
 
104
+
105
+ @st.cache
106
+ def convert_df(df):
107
+ # IMPORTANT: Cache the conversion to prevent computation on every rerun
108
+ return df.to_csv(index=False)
109
+
110
+ csv = convert_df(output_df)
111
+
112
+ st.download_button(
113
+ label="Download data as CSV",
114
+ data=csv,
115
+ file_name='output.csv',
116
+ mime='text/csv',
117
+ )
118
+
119
+ else:
120
+ input_compound = CFG.input_data
121
  min_length = min(input_compound.find('CATALYST') - input_compound.find(':') - 10, 0)
122
  inp = tokenizer(input_compound, return_tensors='pt').to(device)
123
  output = model.generate(**inp, min_length=min_length, max_length=min_length+50, num_beams=CFG.num_beams, num_return_sequences=CFG.num_return_sequences, return_dict_in_generate=True, output_scores=True)
 
135
  scores.append(None)
136
  output += scores
137
  output = [input_compound] + output
138
+
 
139
  else:
140
  output = [tokenizer.decode(output['sequences'][0], skip_special_tokens=True).replace('. ', '.').rstrip('.')]
141
  mol = Chem.MolFromSmiles(output[0])
 
143
  output.append(output[0])
144
  else:
145
  output.append(None)
 
 
146
 
 
 
 
 
 
 
 
 
 
 
 
 
147
 
148
+ if CFG.num_beams > 1:
149
+ output_df = pd.DataFrame(np.array(output).reshape(1, -1), columns=['input'] + [f'{i}th' for i in range(CFG.num_beams)] + ['valid compound'] + [f'{i}th score' for i in range(CFG.num_beams)] + ['valid compound score'])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
150
  else:
151
+ output_df = pd.DataFrame(np.array([input_compound]+output).reshape(1, -1), columns=['input', '0th', 'valid compound'])
152
+ st.table(output_df)
153
 
154
+ @st.cache
155
+ def convert_df(df):
156
+ # IMPORTANT: Cache the conversion to prevent computation on every rerun
157
+ return df.to_csv(index=False)
158
+
159
+ csv = convert_df(output_df)
160
+
161
+ st.download_button(
162
+ label="Download data as CSV",
163
+ data=csv,
164
+ file_name='output.csv',
165
+ mime='text/csv',
166
+ )