sagawa committed on
Commit
70b4c2a
1 Parent(s): aa9d78c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +80 -79
app.py CHANGED
@@ -29,32 +29,69 @@ class CFG():
29
  model = 't5'
30
  seed = 42
31
 
32
-
33
- device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
34
-
35
- def seed_everything(seed=42):
36
- random.seed(seed)
37
- os.environ['PYTHONHASHSEED'] = str(seed)
38
- np.random.seed(seed)
39
- torch.manual_seed(seed)
40
- torch.cuda.manual_seed(seed)
41
- torch.backends.cudnn.deterministic = True
42
- seed_everything(seed=CFG.seed)
43
-
44
-
45
- tokenizer = AutoTokenizer.from_pretrained(CFG.model_name_or_path, return_tensors='pt')
46
 
47
- if CFG.model == 't5':
48
- model = AutoModelForSeq2SeqLM.from_pretrained(CFG.model_name_or_path).to(device)
49
- elif CFG.model == 'deberta':
50
- model = EncoderDecoderModel.from_pretrained(CFG.model_name_or_path).to(device)
51
-
52
-
53
- if CFG.uploaded_file is not None:
54
- input_data = pd.read_csv(CFG.uploaded_file)
55
- outputs = []
56
- for idx, row in input_data.iterrows():
57
- input_compound = row['input']
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  min_length = min(input_compound.find('CATALYST') - input_compound.find(':') - 10, 0)
59
  inp = tokenizer(input_compound, return_tensors='pt').to(device)
60
  output = model.generate(**inp, min_length=min_length, max_length=min_length+50, num_beams=CFG.num_beams, num_return_sequences=CFG.num_return_sequences, return_dict_in_generate=True, output_scores=True)
@@ -71,59 +108,23 @@ if CFG.uploaded_file is not None:
71
  scores.append(None)
72
  output += scores
73
  output = [input_compound] + output
74
- outputs.append(output)
75
-
76
- output_df = pd.DataFrame(outputs, columns=['input'] + [f'{i}th' for i in range(CFG.num_beams)] + ['valid compound'] + [f'{i}th score' for i in range(CFG.num_beams)] + ['valid compound score'])
77
 
78
- @st.cache
79
- def convert_df(df):
80
- # IMPORTANT: Cache the conversion to prevent computation on every rerun
81
- return df.to_csv(index=False)
82
-
83
- csv = convert_df(output_df)
84
 
85
- st.download_button(
86
- label="Download data as CSV",
87
- data=csv,
88
- file_name='output.csv',
89
- mime='text/csv',
90
- )
91
-
92
- else:
93
- input_compound = CFG.input_data
94
- min_length = min(input_compound.find('CATALYST') - input_compound.find(':') - 10, 0)
95
- inp = tokenizer(input_compound, return_tensors='pt').to(device)
96
- output = model.generate(**inp, min_length=min_length, max_length=min_length+50, num_beams=CFG.num_beams, num_return_sequences=CFG.num_return_sequences, return_dict_in_generate=True, output_scores=True)
97
- scores = output['sequences_scores'].tolist()
98
- output = [tokenizer.decode(i, skip_special_tokens=True).replace('. ', '.').rstrip('.') for i in output['sequences']]
99
- for ith, out in enumerate(output):
100
- mol = Chem.MolFromSmiles(out.rstrip('.'))
101
- if type(mol) == rdkit.Chem.rdchem.Mol:
102
- output.append(out.rstrip('.'))
103
- scores.append(scores[ith])
104
- break
105
- if type(mol) == None:
106
- output.append(None)
107
- scores.append(None)
108
- output += scores
109
- output = [input_compound] + output
110
- try:
111
- output_df = pd.DataFrame(np.array(output).reshape(1, -1), columns=['input'] + [f'{i}th' for i in range(CFG.num_beams)] + ['valid compound'] + [f'{i}th score' for i in range(CFG.num_beams)] + ['valid compound score'])
112
- st.table(output_df)
113
-
114
- @st.cache
115
- def convert_df(df):
116
- # IMPORTANT: Cache the conversion to prevent computation on every rerun
117
- return df.to_csv(index=False)
118
-
119
- csv = convert_df(output_df)
120
-
121
- st.download_button(
122
- label="Download data as CSV",
123
- data=csv,
124
- file_name='output.csv',
125
- mime='text/csv',
126
- )
127
-
128
- except:
129
- pass
 
29
  model = 't5'
30
  seed = 42
31
 
32
+ if st.button('predict'):
 
 
 
 
 
 
 
 
 
 
 
 
 
33
 
34
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
35
+
36
+ def seed_everything(seed=42):
37
+ random.seed(seed)
38
+ os.environ['PYTHONHASHSEED'] = str(seed)
39
+ np.random.seed(seed)
40
+ torch.manual_seed(seed)
41
+ torch.cuda.manual_seed(seed)
42
+ torch.backends.cudnn.deterministic = True
43
+ seed_everything(seed=CFG.seed)
44
+
45
+
46
+ tokenizer = AutoTokenizer.from_pretrained(CFG.model_name_or_path, return_tensors='pt')
47
+
48
+ if CFG.model == 't5':
49
+ model = AutoModelForSeq2SeqLM.from_pretrained(CFG.model_name_or_path).to(device)
50
+ elif CFG.model == 'deberta':
51
+ model = EncoderDecoderModel.from_pretrained(CFG.model_name_or_path).to(device)
52
+
53
+
54
+ if CFG.uploaded_file is not None:
55
+ input_data = pd.read_csv(CFG.uploaded_file)
56
+ outputs = []
57
+ for idx, row in input_data.iterrows():
58
+ input_compound = row['input']
59
+ min_length = min(input_compound.find('CATALYST') - input_compound.find(':') - 10, 0)
60
+ inp = tokenizer(input_compound, return_tensors='pt').to(device)
61
+ output = model.generate(**inp, min_length=min_length, max_length=min_length+50, num_beams=CFG.num_beams, num_return_sequences=CFG.num_return_sequences, return_dict_in_generate=True, output_scores=True)
62
+ scores = output['sequences_scores'].tolist()
63
+ output = [tokenizer.decode(i, skip_special_tokens=True).replace('. ', '.').rstrip('.') for i in output['sequences']]
64
+ for ith, out in enumerate(output):
65
+ mol = Chem.MolFromSmiles(out.rstrip('.'))
66
+ if type(mol) == rdkit.Chem.rdchem.Mol:
67
+ output.append(out.rstrip('.'))
68
+ scores.append(scores[ith])
69
+ break
70
+ if type(mol) == None:
71
+ output.append(None)
72
+ scores.append(None)
73
+ output += scores
74
+ output = [input_compound] + output
75
+ outputs.append(output)
76
+
77
+ output_df = pd.DataFrame(outputs, columns=['input'] + [f'{i}th' for i in range(CFG.num_beams)] + ['valid compound'] + [f'{i}th score' for i in range(CFG.num_beams)] + ['valid compound score'])
78
+
79
+ @st.cache
80
+ def convert_df(df):
81
+ # IMPORTANT: Cache the conversion to prevent computation on every rerun
82
+ return df.to_csv(index=False)
83
+
84
+ csv = convert_df(output_df)
85
+
86
+ st.download_button(
87
+ label="Download data as CSV",
88
+ data=csv,
89
+ file_name='output.csv',
90
+ mime='text/csv',
91
+ )
92
+
93
+ else:
94
+ input_compound = CFG.input_data
95
  min_length = min(input_compound.find('CATALYST') - input_compound.find(':') - 10, 0)
96
  inp = tokenizer(input_compound, return_tensors='pt').to(device)
97
  output = model.generate(**inp, min_length=min_length, max_length=min_length+50, num_beams=CFG.num_beams, num_return_sequences=CFG.num_return_sequences, return_dict_in_generate=True, output_scores=True)
 
108
  scores.append(None)
109
  output += scores
110
  output = [input_compound] + output
111
+ try:
112
+ output_df = pd.DataFrame(np.array(output).reshape(1, -1), columns=['input'] + [f'{i}th' for i in range(CFG.num_beams)] + ['valid compound'] + [f'{i}th score' for i in range(CFG.num_beams)] + ['valid compound score'])
113
+ st.table(output_df)
114
 
115
+ @st.cache
116
+ def convert_df(df):
117
+ # IMPORTANT: Cache the conversion to prevent computation on every rerun
118
+ return df.to_csv(index=False)
 
 
119
 
120
+ csv = convert_df(output_df)
121
+
122
+ st.download_button(
123
+ label="Download data as CSV",
124
+ data=csv,
125
+ file_name='output.csv',
126
+ mime='text/csv',
127
+ )
128
+
129
+ except:
130
+ pass