ipd committed on
Commit
44be2ad
·
1 Parent(s): 98e9763
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. Dockerfile +13 -0
  2. Dockerfile-conda +13 -0
  3. README.md +1 -1
  4. app.py +489 -323
  5. data/lce/test.csv +31 -0
  6. data/lce/test_data.csv +14 -0
  7. data/lce/train.csv +121 -0
  8. data/lce/train_data.csv +148 -0
  9. models/.gitattributes +3 -0
  10. models/fm4m.py +366 -74
  11. models/mhg_model/README.md +1 -1
  12. models/mhg_model/images/mhg_example.png +0 -0
  13. models/mhg_model/images/mhg_example1.png +0 -0
  14. models/mhg_model/images/mhg_example2.png +0 -0
  15. models/mhg_model/load.py +22 -3
  16. models/mhg_model/paper/MHG-GNN_Combination of Molecular Hypergraph Grammar with Graph Neural Network.pdf +0 -0
  17. models/selfies_model/selfies-ted.png +0 -0
  18. models/selfies_ted/README.md +87 -0
  19. models/selfies_ted/load.py +92 -0
  20. models/selfies_ted/requirements.txt +12 -0
  21. models/selfies_ted/selfies-ted-example.ipynb +136 -0
  22. models/selfies_ted/selfies-ted.png +3 -0
  23. models/smi_ted/.gitignore +18 -0
  24. models/smi_ted/README.md +138 -0
  25. models/smi_ted/finetune/args.py +337 -0
  26. models/smi_ted/finetune/finetune_classification.py +68 -0
  27. models/smi_ted/finetune/finetune_classification_multitask.py +101 -0
  28. models/smi_ted/finetune/finetune_regression.py +70 -0
  29. models/smi_ted/finetune/moleculenet/bace/test.csv +3 -0
  30. models/smi_ted/finetune/moleculenet/bace/train.csv +3 -0
  31. models/smi_ted/finetune/moleculenet/bace/valid.csv +3 -0
  32. models/smi_ted/finetune/moleculenet/bbbp/test.csv +3 -0
  33. models/smi_ted/finetune/moleculenet/bbbp/train.csv +3 -0
  34. models/smi_ted/finetune/moleculenet/bbbp/valid.csv +3 -0
  35. models/smi_ted/finetune/moleculenet/biodegradability/biodeg_example.csv +3 -0
  36. models/smi_ted/finetune/moleculenet/biodegradability/biodegradability.csv +3 -0
  37. models/smi_ted/finetune/moleculenet/biodegradability/test.csv +3 -0
  38. models/smi_ted/finetune/moleculenet/biodegradability/train.csv +3 -0
  39. models/smi_ted/finetune/moleculenet/biodegradability/valid.csv +3 -0
  40. models/smi_ted/finetune/moleculenet/clintox/test.csv +3 -0
  41. models/smi_ted/finetune/moleculenet/clintox/train.csv +3 -0
  42. models/smi_ted/finetune/moleculenet/clintox/valid.csv +3 -0
  43. models/smi_ted/finetune/moleculenet/esol/test.csv +3 -0
  44. models/smi_ted/finetune/moleculenet/esol/train.csv +3 -0
  45. models/smi_ted/finetune/moleculenet/esol/valid.csv +3 -0
  46. models/smi_ted/finetune/moleculenet/freesolv/test.csv +3 -0
  47. models/smi_ted/finetune/moleculenet/freesolv/train.csv +3 -0
  48. models/smi_ted/finetune/moleculenet/freesolv/valid.csv +3 -0
  49. models/smi_ted/finetune/moleculenet/hiv/test.csv +3 -0
  50. models/smi_ted/finetune/moleculenet/hiv/train.csv +3 -0
Dockerfile ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.9.7
2
+
3
+ WORKDIR /app
4
+ COPY requirements.txt .
5
+ RUN pip install -r requirements.txt
6
+ # preload models
7
+ RUN python -c '\
8
+ from transformers import BartForConditionalGeneration, AutoTokenizer;\
9
+ AutoTokenizer.from_pretrained("ibm/materials.selfies-ted");\
10
+ BartForConditionalGeneration.from_pretrained("ibm/materials.selfies-ted")'
11
+ COPY . .
12
+
13
+ CMD ["python", "app.py"]
Dockerfile-conda ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM condaforge/miniforge3
2
+
3
+ WORKDIR /app
4
+ SHELL ["/bin/bash", "-i", "-c"]
5
+ RUN apt-get update && \
6
+ apt-get install -y build-essential libxrender1 libxext-dev
7
+ RUN conda create --name fm4m python=3.9.7
8
+ RUN conda activate fm4m
9
+ COPY requirements.txt .
10
+ RUN pip install -r requirements.txt
11
+ COPY . .
12
+
13
+ CMD ["python", "app.py"]
README.md CHANGED
@@ -1,5 +1,5 @@
1
  ---
2
- title: Fm4m Kit
3
  emoji: 🐢
4
  colorFrom: indigo
5
  colorTo: blue
 
1
  ---
2
+ title: Fix Fm4m Kit
3
  emoji: 🐢
4
  colorFrom: indigo
5
  colorTo: blue
app.py CHANGED
@@ -1,142 +1,103 @@
1
  import gradio as gr
2
- from huggingface_hub import InferenceClient
3
  import matplotlib.pyplot as plt
4
- from PIL import Image
5
- from rdkit.Chem import Descriptors, QED, Draw
6
- from rdkit.Chem.Crippen import MolLogP
7
  import pandas as pd
8
- from rdkit.Contrib.SA_Score import sascorer
9
- from rdkit.Chem import DataStructs, AllChem
10
- from transformers import BartForConditionalGeneration, AutoTokenizer, AutoModel
11
- from transformers.modeling_outputs import BaseModelOutput
12
  import selfies as sf
13
- from rdkit import Chem
14
  import torch
15
- import numpy as np
16
- import umap
17
- import pickle
18
  import xgboost as xgb
19
- from sklearn.svm import SVR
20
- from sklearn.linear_model import LinearRegression
 
 
 
21
  from sklearn.kernel_ridge import KernelRidge
22
- import json
23
-
24
- import os
 
25
 
26
  os.environ["OMP_MAX_ACTIVE_LEVELS"] = "1"
27
 
28
- # my_theme = gr.Theme.from_hub("ysharma/steampunk")
29
- # my_theme = gr.themes.Glass()
30
-
31
- """
32
- # カスタムテーマ設定
33
- theme = gr.themes.Default().set(
34
- body_background_fill="#000000", # 背景色を黒に設定
35
- text_color="#FFFFFF", # テキスト色を白に設定
36
- )
37
- """
38
- """
39
- import sys
40
- sys.path.append("models")
41
- sys.path.append("../models")
42
- sys.path.append("../")"""
43
-
44
-
45
- # Get the current file's directory
46
- base_dir = os.path.dirname(__file__)
47
- print("Base Dir : ", base_dir)
48
-
49
  import models.fm4m as fm4m
50
 
 
 
51
 
52
  # Function to display molecule image from SMILES
53
  def smiles_to_image(smiles):
54
  mol = Chem.MolFromSmiles(smiles)
55
- if mol:
56
- img = Draw.MolToImage(mol)
57
- return img
58
- return None
59
-
60
-
61
- # Function to get canonical SMILES
62
- def get_canonical_smiles(smiles):
63
- mol = Chem.MolFromSmiles(smiles)
64
- if mol:
65
- return Chem.MolToSmiles(mol, canonical=True)
66
- return None
67
 
68
 
69
  # Dictionary for SMILES strings and corresponding images (you can replace with your actual image paths)
70
  smiles_image_mapping = {
71
- "Mol 1": {"smiles": "C=C(C)CC(=O)NC[C@H](CO)NC(=O)C=Cc1ccc(C)c(Cl)c1", "image": "img/img1.png"},
 
 
 
72
  # Example SMILES for ethanol
73
- "Mol 2": {"smiles": "C=CC1(CC(=O)NC[C@@H](CCCC)NC(=O)c2cc(Cl)cc(Br)c2)CC1", "image": "img/img2.png"},
 
 
 
74
  # Example SMILES for butane
75
- "Mol 3": {"smiles": "C=C(C)C[C@H](NC(C)=O)C(=O)N1CC[C@H](NC(=O)[C@H]2C[C@@]2(C)Br)C(C)(C)C1",
76
- "image": "img/img3.png"}, # Example SMILES for ethylamine
77
- "Mol 4": {"smiles": "C=C1CC(CC(=O)N[C@H]2CCN(C(=O)c3ncccc3SC)C23CC3)C1", "image": "img/img4.png"},
 
 
 
 
 
78
  # Example SMILES for diethyl ether
79
- "Mol 5": {"smiles": "C=CCS[C@@H](C)CC(=O)OCC", "image": "img/img5.png"} # Example SMILES for chloroethane
 
 
 
80
  }
81
 
82
  datasets = [" ", "BACE", "ESOL", "Load Custom Dataset"]
83
 
84
- models_enabled = ["SELFIES-TED", "MHG-GED", "MolFormer", "SMI-TED"]
 
 
 
 
 
 
 
85
 
86
  fusion_available = ["Concat"]
87
 
88
- global log_df
89
- log_df = pd.DataFrame(columns=["Selected Models", "Dataset", "Task", "Result"])
90
-
91
-
92
- def log_selection(models, dataset, task_type, result, log_df):
93
- # Append the new entry to the DataFrame
94
- new_entry = {"Selected Models": str(models), "Dataset": dataset, "Task": task_type, "Result": result}
95
- updated_log_df = log_df.append(new_entry, ignore_index=True)
96
- return updated_log_df
97
-
98
 
99
  # Function to handle evaluation and logging
100
- def save_rep(models, dataset, task_type, eval_output):
101
- return
102
- def evaluate_and_log(models, dataset, task_type, eval_output):
103
  task_dic = {'Classification': 'CLS', 'Regression': 'RGR'}
104
- result = f"{eval_output}"#display_eval(models, dataset, task_type, fusion_type=None)
105
  result = result.replace(" Score", "")
106
 
107
- new_entry = {"Selected Models": str(models), "Dataset": dataset, "Task": task_dic[task_type], "Result": result}
 
 
 
 
 
108
  new_entry_df = pd.DataFrame([new_entry])
109
 
110
- log_df = pd.read_csv('log.csv', index_col=0)
111
- log_df = pd.concat([new_entry_df, log_df])
112
-
113
- log_df.to_csv('log.csv')
114
-
115
- return log_df
116
-
117
-
118
- try:
119
- log_df = pd.read_csv('log.csv', index_col=0)
120
- except:
121
- log_df = pd.DataFrame({"":[],
122
- 'Selected Models': [],
123
- 'Dataset': [],
124
- 'Task': [],
125
- 'Result': []
126
- })
127
- csv_file_path = 'log.csv'
128
- log_df.to_csv(csv_file_path, index=False)
129
 
130
 
131
  # Load images for selection
132
  def load_image(path):
133
  try:
134
- return Image.open(smiles_image_mapping[path]["image"])# Image.1open(path)
135
  except:
136
  pass
137
 
138
 
139
-
140
  # Function to handle image selection
141
  def handle_image_selection(image_key):
142
  smiles = smiles_image_mapping[image_key]["smiles"]
@@ -160,49 +121,55 @@ def calculate_tanimoto(smiles1, smiles2):
160
  mol1 = Chem.MolFromSmiles(smiles1)
161
  mol2 = Chem.MolFromSmiles(smiles2)
162
  if mol1 and mol2:
163
- # fp1 = FingerprintMols.FingerprintMol(mol1)
164
- # fp2 = FingerprintMols.FingerprintMol(mol2)
165
  fp1 = AllChem.GetMorganFingerprintAsBitVect(mol1, 2)
166
  fp2 = AllChem.GetMorganFingerprintAsBitVect(mol2, 2)
167
  return round(DataStructs.FingerprintSimilarity(fp1, fp2), 2)
168
  return None
169
 
170
 
171
- #with open("models/selfies_model/bart-2908.pickle", "rb") as input_file:
172
- # gen_model, gen_tokenizer = pickle.load(input_file)
173
-
174
  gen_tokenizer = AutoTokenizer.from_pretrained("ibm/materials.selfies-ted")
175
  gen_model = BartForConditionalGeneration.from_pretrained("ibm/materials.selfies-ted")
176
 
177
 
178
  def generate(latent_vector, mask):
179
  encoder_outputs = BaseModelOutput(latent_vector)
180
- decoder_output = gen_model.generate(encoder_outputs=encoder_outputs, attention_mask=mask,
181
- max_new_tokens=64, do_sample=True, top_k=5, top_p=0.95, num_return_sequences=1)
 
 
 
 
 
 
 
182
  selfies = gen_tokenizer.batch_decode(decoder_output, skip_special_tokens=True)
183
- outs = []
184
- for i in selfies:
185
- outs.append(sf.decoder(i.replace("] [", "][")))
186
- return outs
187
 
188
 
189
  def perturb_latent(latent_vecs, noise_scale=0.5):
190
- modified_vec = torch.tensor(np.random.uniform(0, 1, latent_vecs.shape) * noise_scale,
191
- dtype=torch.float32) + latent_vecs
192
- return modified_vec
 
 
 
 
193
 
194
 
195
  def encode(selfies):
196
- encoding = gen_tokenizer(selfies, return_tensors='pt', max_length=128, truncation=True, padding='max_length')
 
 
 
 
 
 
197
  input_ids = encoding['input_ids']
198
  attention_mask = encoding['attention_mask']
199
- outputs = gen_model.model.encoder(input_ids=input_ids, attention_mask=attention_mask)
 
 
200
  model_output = outputs.last_hidden_state
201
-
202
- """input_mask_expanded = attention_mask.unsqueeze(-1).expand(model_output.size()).float()
203
- sum_embeddings = torch.sum(model_output * input_mask_expanded, 1)
204
- sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
205
- model_output = sum_embeddings / sum_mask"""
206
  return model_output, attention_mask
207
 
208
 
@@ -217,8 +184,13 @@ def generate_canonical(smiles):
217
  noise = i / 10
218
  perturbed_latent = perturb_latent(latent_vec, noise_scale=noise)
219
  gen = generate(perturbed_latent, mask)
220
- gen_mol = Chem.MolToSmiles(Chem.MolFromSmiles(gen[0]))
221
- if gen_mol != Chem.MolToSmiles(Chem.MolFromSmiles(smiles)): break
 
 
 
 
 
222
 
223
  if gen_mol:
224
  # Calculate properties for ref and gen molecules
@@ -230,9 +202,20 @@ def generate_canonical(smiles):
230
  # Prepare the table with ref mol and gen mol
231
  data = {
232
  "Property": ["QED", "SA", "LogP", "Mol Wt", "Tanimoto Similarity"],
233
- "Reference Mol": [ref_properties[0], ref_properties[1], ref_properties[2], ref_properties[3],
234
- tanimoto_similarity],
235
- "Generated Mol": [gen_properties[0], gen_properties[1], gen_properties[2], gen_properties[3], ""]
 
 
 
 
 
 
 
 
 
 
 
236
  }
237
  df = pd.DataFrame(data)
238
 
@@ -245,7 +228,7 @@ def generate_canonical(smiles):
245
 
246
 
247
  # Function to display evaluation score
248
- def display_eval(selected_models, dataset, task_type, downstream, fusion_type):
249
  result = None
250
 
251
  try:
@@ -260,72 +243,87 @@ def display_eval(selected_models, dataset, task_type, downstream, fusion_type):
260
  downstream_model = downstream_model.rstrip()
261
  params = None
262
 
263
-
264
-
265
-
266
  try:
267
  if not selected_models:
268
  return "Please select at least one enabled model."
269
 
270
- if task_type == "Classification":
271
- global roc_auc, fpr, tpr, x_batch, y_batch
272
- elif task_type == "Regression":
273
- global RMSE, y_batch_test, y_prob
274
-
275
  if len(selected_models) > 1:
276
  if task_type == "Classification":
277
- #result, roc_auc, fpr, tpr, x_batch, y_batch = fm4m.multi_modal(model_list=selected_models,
278
- # downstream_model="XGBClassifier",
279
- # dataset=dataset.lower())
280
  if downstream_model == "Default Settings":
281
  downstream_model = "DefaultClassifier"
282
  params = None
283
- result, roc_auc, fpr, tpr, x_batch, y_batch = fm4m.multi_modal(model_list=selected_models,
284
- downstream_model=downstream_model,
285
- params = params,
286
- dataset=dataset)
287
 
288
- elif task_type == "Regression":
289
- #result, RMSE, y_batch_test, y_prob = fm4m.multi_modal(model_list=selected_models,
290
- # downstream_model="XGBRegressor",
291
- # dataset=dataset.lower())
 
 
 
 
 
 
 
 
 
292
 
 
293
  if downstream_model == "Default Settings":
294
  downstream_model = "DefaultRegressor"
295
  params = None
296
 
297
- result, RMSE, y_batch_test, y_prob, x_batch, y_batch = fm4m.multi_modal(model_list=selected_models,
298
- downstream_model=downstream_model,
299
- params=params,
300
- dataset=dataset)
 
 
 
 
 
 
 
 
 
301
 
302
  else:
303
  if task_type == "Classification":
304
- #result, roc_auc, fpr, tpr, x_batch, y_batch = fm4m.single_modal(model=selected_models[0],
305
- # downstream_model="XGBClassifier",
306
- # dataset=dataset.lower())
307
  if downstream_model == "Default Settings":
308
  downstream_model = "DefaultClassifier"
309
  params = None
310
 
311
- result, roc_auc, fpr, tpr, x_batch, y_batch = fm4m.single_modal(model=selected_models[0],
312
- downstream_model=downstream_model,
313
- params=params,
314
- dataset=dataset)
 
 
 
 
 
 
 
 
 
315
 
316
  elif task_type == "Regression":
317
- #result, RMSE, y_batch_test, y_prob = fm4m.single_modal(model=selected_models[0],
318
- # downstream_model="XGBRegressor",
319
- # dataset=dataset.lower())
320
-
321
  if downstream_model == "Default Settings":
322
  downstream_model = "DefaultRegressor"
323
  params = None
324
 
325
- result, RMSE, y_batch_test, y_prob, x_batch, y_batch = fm4m.single_modal(model=selected_models[0],
326
- downstream_model=downstream_model,
327
- params=params,
328
- dataset=dataset)
 
 
 
 
 
 
 
 
 
329
 
330
  if result == None:
331
  result = "Data & Model Setting is incorrect"
@@ -335,23 +333,15 @@ def display_eval(selected_models, dataset, task_type, downstream, fusion_type):
335
 
336
 
337
  # Function to handle plot display
338
- def display_plot(plot_type):
339
  fig, ax = plt.subplots()
340
 
341
  if plot_type == "Latent Space":
342
- global x_batch, y_batch
343
  ax.set_title("T-SNE Plot")
344
- # reducer = umap.UMAP(metric='euclidean', n_neighbors= 10, n_components=2, low_memory=True, min_dist=0.1, verbose=False)
345
- # features_umap = reducer.fit_transform(x_batch[:500])
346
- # x = y_batch.values[:500]
347
- # index_0 = [index for index in range(len(x)) if x[index] == 0]
348
- # index_1 = [index for index in range(len(x)) if x[index] == 1]
349
- class_0 = x_batch # features_umap[index_0]
350
- class_1 = y_batch # features_umap[index_1]
351
-
352
- """with open("latent_multi_bace.pkl", "rb") as f:
353
- class_0, class_1 = pickle.load(f)
354
- """
355
  plt.scatter(class_1[:, 0], class_1[:, 1], c='red', label='Class 1')
356
  plt.scatter(class_0[:, 0], class_0[:, 1], c='blue', label='Class 0')
357
 
@@ -360,10 +350,16 @@ def display_plot(plot_type):
360
  ax.set_title('Dataset Distribution')
361
 
362
  elif plot_type == "ROC-AUC":
363
- global roc_auc, fpr, tpr
364
  ax.set_title("ROC-AUC Curve")
365
  try:
366
- ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.4f})')
 
 
 
 
 
 
367
  ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
368
  ax.set_xlim([0.0, 1.0])
369
  ax.set_ylim([0.0, 1.05])
@@ -375,7 +371,11 @@ def display_plot(plot_type):
375
  ax.legend(loc='lower right')
376
 
377
  elif plot_type == "Parity Plot":
378
- global RMSE, y_batch_test, y_prob
 
 
 
 
379
  ax.set_title("Parity plot")
380
 
381
  # change format
@@ -384,7 +384,12 @@ def display_plot(plot_type):
384
  print(y_prob)
385
  y_batch_test = np.array(y_batch_test, dtype=float)
386
  y_prob = np.array(y_prob, dtype=float)
387
- ax.scatter(y_batch_test, y_prob, color="blue", label=f"Predicted vs Actual (RMSE: {RMSE:.4f})")
 
 
 
 
 
388
  min_val = min(min(y_batch_test), min(y_prob))
389
  max_val = max(max(y_batch_test), max(y_prob))
390
  ax.plot([min_val, max_val], [min_val, max_val], 'r-')
@@ -397,10 +402,6 @@ def display_plot(plot_type):
397
  print(y_batch_test)
398
  print(y_prob)
399
 
400
-
401
-
402
-
403
-
404
  ax.set_xlabel('Actual Values')
405
  ax.set_ylabel('Predicted Values')
406
 
@@ -419,13 +420,25 @@ predefined_datasets = {
419
  # Function to load a predefined dataset from the local path
420
  def load_predefined_dataset(dataset_name):
421
  val = predefined_datasets.get(dataset_name)
422
- try: file_path = val.split(",")[0]
423
- except:file_path=False
 
 
424
 
425
  if file_path:
426
  df = pd.read_csv(file_path)
427
- return df.head(), gr.update(choices=list(df.columns)), gr.update(choices=list(df.columns)), f"{dataset_name.lower()}"
428
- return pd.DataFrame(), gr.update(choices=[]), gr.update(choices=[]), f"Dataset not found"
 
 
 
 
 
 
 
 
 
 
429
 
430
 
431
  # Function to display the head of the uploaded CSV file
@@ -433,7 +446,11 @@ def display_csv_head(file):
433
  if file is not None:
434
  # Load the CSV file into a DataFrame
435
  df = pd.read_csv(file.name)
436
- return df.head(), gr.update(choices=list(df.columns)), gr.update(choices=list(df.columns))
 
 
 
 
437
  return pd.DataFrame(), gr.update(choices=[]), gr.update(choices=[])
438
 
439
 
@@ -441,28 +458,54 @@ def display_csv_head(file):
441
  def handle_dataset_selection(selected_dataset):
442
  if selected_dataset == "Custom Dataset":
443
  # Show file upload fields for train and test datasets if "Custom Dataset" is selected
444
- return gr.update(visible=True), gr.update(visible=True), gr.update(visible=True), gr.update(visible=True), gr.update(
445
- visible=True), gr.update(visible=False), gr.update(visible=True), gr.update(visible=True)
 
 
 
 
 
 
 
 
446
  else:
447
- return gr.update(visible=True), gr.update(visible=False), gr.update(visible=False), gr.update(
448
- visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)
 
 
 
 
 
 
 
 
449
 
450
 
451
  # Function to select input and output columns and display a message
452
- def select_columns(input_column, output_column, train_data, test_data,dataset_name):
453
  if input_column and output_column:
454
  return f"{train_data.name},{test_data.name},{input_column},{output_column},{dataset_name}"
455
  return "Please select both input and output columns."
456
 
457
- def set_dataname(dataset_name, dataset_selector ):
 
458
  if dataset_selector == "Custom Dataset":
459
  return f"{dataset_name}"
460
  return f"{dataset_selector}"
461
 
 
462
  # Function to create model based on user input
463
- def create_model(model_name, max_depth=None, n_estimators=None, alpha=None, degree=None, kernel=None):
 
 
464
  if model_name == "XGBClassifier":
465
- model = xgb.XGBClassifier(objective='binary:logistic',eval_metric= 'auc', max_depth=max_depth, n_estimators=n_estimators, alpha=alpha)
 
 
 
 
 
 
466
  elif model_name == "SVR":
467
  model = SVR(degree=degree, kernel=kernel)
468
  elif model_name == "Kernel Ridge":
@@ -476,224 +519,339 @@ def create_model(model_name, max_depth=None, n_estimators=None, alpha=None, degr
476
  return "Model not supported."
477
 
478
  return f"{model_name} * {model.get_params()}"
479
- def model_selector(model_name):
480
- # Dynamically return the appropriate hyperparameter components based on the selected model
481
- if model_name == "XGBClassifier":
482
- return (
483
- gr.Slider(1, 10, label="max_depth"),
484
- gr.Slider(50, 500, label="n_estimators"),
485
- gr.Slider(0.1, 10.0, step=0.1, label="alpha")
486
- )
487
- elif model_name == "SVR":
488
- return (
489
- gr.Slider(1, 5, label="degree"),
490
- gr.Dropdown(["rbf", "poly", "linear"], label="kernel")
491
- )
492
- elif model_name == "Kernel Ridge":
493
- return (
494
- gr.Slider(0.1, 10.0, step=0.1, label="alpha"),
495
- gr.Slider(1, 5, label="degree"),
496
- gr.Dropdown(["rbf", "poly", "linear"], label="kernel")
497
- )
498
- elif model_name == "Linear Regression":
499
- return () # No hyperparameters for Linear Regression
500
- else:
501
- return ()
502
-
503
 
504
 
505
  # Define the Gradio layout
506
- # with gr.Blocks(theme=my_theme) as demo:
507
  with gr.Blocks() as demo:
 
 
 
 
508
  with gr.Row():
509
  # Left Column
510
  with gr.Column():
511
- gr.HTML('''
 
512
  <div style="background-color: #6A8EAE; color: #FFFFFF; padding: 10px;">
513
  <h3 style="color: #FFFFFF; margin: 0;font-size: 20px;"> Data & Model Setting</h3>
514
  </div>
515
- ''')
516
- # gr.Markdown("## Data & Model Setting")
517
- #dataset_dropdown = gr.Dropdown(choices=datasets, label="Select Dat")
518
-
519
  # Dropdown menu for predefined datasets including "Custom Dataset" option
520
- dataset_selector = gr.Dropdown(label="Select Dataset",
521
- choices=list(predefined_datasets.keys()) + ["Custom Dataset"])
 
 
522
  # Display the message for selected columns
523
- selected_columns_message = gr.Textbox(label="Selected Columns Info", visible=False)
 
 
524
 
525
  with gr.Accordion("Dataset Settings", open=True):
526
  # File upload options for custom dataset (train and test)
527
  dataset_name = gr.Textbox(label="Dataset Name", visible=False)
528
- train_file = gr.File(label="Upload Custom Train Dataset", file_types=[".csv"], visible=False)
529
- train_display = gr.Dataframe(label="Train Dataset Preview (First 5 Rows)", visible=False, interactive=False)
 
 
 
 
 
 
 
 
530
 
531
- test_file = gr.File(label="Upload Custom Test Dataset", file_types=[".csv"], visible=False)
532
- test_display = gr.Dataframe(label="Test Dataset Preview (First 5 Rows)", visible=False, interactive=False)
 
 
 
 
 
 
 
 
533
 
534
  # Predefined dataset displays
535
- predefined_display = gr.Dataframe(label="Predefined Dataset Preview (First 5 Rows)", visible=False,
536
- interactive=False)
537
-
538
-
 
539
 
540
  # Dropdowns for selecting input and output columns for the custom dataset
541
- input_column_selector = gr.Dropdown(label="Select Input Column", choices=[], visible=False)
542
- output_column_selector = gr.Dropdown(label="Select Output Column", choices=[], visible=False)
543
-
544
- #selected_columns_message = gr.Textbox(label="Selected Columns Info", visible=True)
 
 
545
 
546
  # When a dataset is selected, show either file upload fields (for custom) or load predefined datasets
547
- dataset_selector.change(handle_dataset_selection,
548
- inputs=dataset_selector,
549
- outputs=[dataset_name, train_file, train_display, test_file, test_display, predefined_display,
550
- input_column_selector, output_column_selector])
 
 
 
 
 
 
 
 
 
 
551
 
552
  # When a predefined dataset is selected, load its head and update column selectors
553
- dataset_selector.change(load_predefined_dataset,
554
- inputs=dataset_selector,
555
- outputs=[predefined_display, input_column_selector, output_column_selector, selected_columns_message])
 
 
 
 
 
 
 
556
 
557
  # When a custom train file is uploaded, display its head and update column selectors
558
- train_file.change(display_csv_head, inputs=train_file,
559
- outputs=[train_display, input_column_selector, output_column_selector])
 
 
 
 
 
 
 
560
 
561
  # When a custom test file is uploaded, display its head
562
- test_file.change(display_csv_head, inputs=test_file,
563
- outputs=[test_display, input_column_selector, output_column_selector])
 
 
 
 
 
 
 
564
 
565
- dataset_selector.change(set_dataname,
566
- inputs=[dataset_name, dataset_selector],
567
- outputs=dataset_name)
 
 
568
 
569
  # Update the selected columns information when dropdown values are changed
570
- input_column_selector.change(select_columns,
571
- inputs=[input_column_selector, output_column_selector, train_file, test_file, dataset_name],
572
- outputs=selected_columns_message)
573
-
574
- output_column_selector.change(select_columns,
575
- inputs=[input_column_selector, output_column_selector, train_file, test_file, dataset_name],
576
- outputs=selected_columns_message)
 
 
 
 
577
 
578
- model_checkbox = gr.CheckboxGroup(choices=models_enabled, label="Select Model")
 
 
 
 
 
 
 
 
 
 
579
 
580
- # Add disabled checkboxes for GNN and FNN
581
- # gnn_checkbox = gr.Checkbox(label="GNN (Disabled)", value=False, interactive=False)
582
- # fnn_checkbox = gr.Checkbox(label="FNN (Disabled)", value=False, interactive=False)
583
 
584
- task_radiobutton = gr.Radio(choices=["Classification", "Regression"], label="Task Type")
 
 
585
 
586
  ####### adding hyper parameter tuning ###########
587
- model_name = gr.Dropdown(["Default - Auto", "XGBClassifier", "SVR", "Kernel Ridge", "Linear Regression"], label="Select Downstream Model")
 
 
 
 
 
 
 
 
 
588
  with gr.Accordion("Downstream Hyperparameter Settings", open=True):
589
  # Create placeholders for hyperparameter components
590
- max_depth = gr.Slider(1, 20, step=1,visible=False, label="max_depth")
591
- n_estimators = gr.Slider(100, 5000, step=100, visible=False, label="n_estimators")
 
 
592
  alpha = gr.Slider(0.1, 10.0, step=0.1, visible=False, label="alpha")
593
- degree = gr.Slider(1, 20, step=1,visible=False, label="degree")
594
- kernel = gr.Dropdown(choices=["rbf", "poly", "linear"], visible=False, label="kernel")
 
 
595
 
596
  # Output textbox
597
  output = gr.Textbox(label="Loaded Parameters")
598
 
599
-
600
  # Dynamically show relevant hyperparameters based on selected model
601
  def update_hyperparameters(model_name):
602
  if model_name == "XGBClassifier":
603
- return gr.update(visible=True), gr.update(visible=True), gr.update(visible=True), gr.update(
604
- visible=False), gr.update(visible=False)
 
 
 
 
 
605
  elif model_name == "SVR":
606
- return gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(
607
- visible=True), gr.update(visible=True)
 
 
 
 
 
608
  elif model_name == "Kernel Ridge":
609
- return gr.update(visible=False), gr.update(visible=False), gr.update(visible=True), gr.update(
610
- visible=True), gr.update(visible=True)
 
 
 
 
 
611
  elif model_name == "Linear Regression":
612
- return gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(
613
- visible=False), gr.update(visible=False)
 
 
 
 
 
614
  elif model_name == "Default - Auto":
615
- return gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(
616
- visible=False), gr.update(visible=False)
617
-
 
 
 
 
618
 
619
  # When model is selected, update which hyperparameters are visible
620
- model_name.change(update_hyperparameters, inputs=[model_name],
621
- outputs=[max_depth, n_estimators, alpha, degree, kernel])
 
 
 
622
 
623
  # Submit button to create the model with selected hyperparameters
624
  submit_button = gr.Button("Create Downstream Model")
625
 
626
-
627
  # Function to handle model creation based on input parameters
628
  def on_submit(model_name, max_depth, n_estimators, alpha, degree, kernel):
629
  if model_name == "XGBClassifier":
630
- return create_model(model_name, max_depth=max_depth, n_estimators=n_estimators, alpha=alpha)
 
 
 
 
 
631
  elif model_name == "SVR":
632
  return create_model(model_name, degree=degree, kernel=kernel)
633
  elif model_name == "Kernel Ridge":
634
- return create_model(model_name, alpha=alpha, degree=degree, kernel=kernel)
 
 
635
  elif model_name == "Linear Regression":
636
  return create_model(model_name)
637
  elif model_name == "Default - Auto":
638
  return create_model(model_name)
639
 
640
  # When the submit button is clicked, run the on_submit function
641
- submit_button.click(on_submit, inputs=[model_name, max_depth, n_estimators, alpha, degree, kernel],
642
- outputs=output)
 
 
 
643
  ###### End of hyper param tuning #########
644
 
645
  fusion_radiobutton = gr.Radio(choices=fusion_available, label="Fusion Type")
646
 
647
-
648
-
649
  eval_button = gr.Button("Train downstream model")
650
- #eval_button.style(css_class="custom-button-left")
651
 
652
  # Middle Column
653
  with gr.Column():
654
- gr.HTML('''
 
655
  <div style="background-color: #8F9779; color: #FFFFFF; padding: 10px;">
656
  <h3 style="color: #FFFFFF; margin: 0;font-size: 20px;"> Downstream Task 1: Property Prediction</h3>
657
  </div>
658
- ''')
659
- # gr.Markdown("## Downstream task Result")
660
  eval_output = gr.Textbox(label="Train downstream model")
661
 
662
- plot_radio = gr.Radio(choices=["ROC-AUC", "Parity Plot", "Latent Space"], label="Select Plot Type")
663
- plot_output = gr.Plot(label="Visualization")#, height=250, width=250)
664
-
665
- #download_rep = gr.Button("Download representation")
 
666
 
667
  create_log = gr.Button("Store log")
668
 
669
- log_table = gr.Dataframe(value=log_df, label="Log of Selections and Results", interactive=False)
670
-
671
- eval_button.click(display_eval,
672
- inputs=[model_checkbox, selected_columns_message, task_radiobutton, output, fusion_radiobutton],
673
- outputs=eval_output)
674
-
675
- plot_radio.change(display_plot, inputs=plot_radio, outputs=plot_output)
676
-
 
 
 
 
 
 
 
 
 
 
 
 
677
 
678
  # Function to gather selected models
679
  def gather_selected_models(*models):
680
  selected = [model for model in models if model]
681
  return selected
682
 
683
-
684
- create_log.click(evaluate_and_log, inputs=[model_checkbox, dataset_name, task_radiobutton, eval_output],
685
- outputs=log_table)
686
- #download_rep.click(save_rep, inputs=[model_checkbox, dataset_name, task_radiobutton, eval_output],
687
- # outputs=None)
688
-
 
 
 
 
 
689
  # Right Column
690
  with gr.Column():
691
- gr.HTML('''
 
692
  <div style="background-color: #D2B48C; color: #FFFFFF; padding: 10px;">
693
  <h3 style="color: #FFFFFF; margin: 0;font-size: 20px;"> Downstream Task 2: Molecule Generation</h3>
694
  </div>
695
- ''')
696
- # gr.Markdown("## Molecular Generation")
697
  smiles_input = gr.Textbox(label="Input SMILES String")
698
  image_display = gr.Image(label="Molecule Image", height=250, width=250)
699
  # Show images for selection
@@ -702,24 +860,32 @@ with gr.Blocks() as demo:
702
  choices=list(smiles_image_mapping.keys()),
703
  label="Select from sample molecules",
704
  value=None,
705
- #item_images=[load_image(smiles_image_mapping[key]["image"]) for key in smiles_image_mapping.keys()]
706
  )
707
  image_selector.change(load_image, image_selector, image_display)
708
  generate_button = gr.Button("Generate")
709
- gen_image_display = gr.Image(label="Generated Molecule Image", height=250, width=250)
 
 
710
  generated_output = gr.Textbox(label="Generated Output")
711
  property_table = gr.Dataframe(label="Molecular Properties Comparison")
712
 
713
-
714
-
715
  # Handle image selection
716
- image_selector.change(handle_image_selection, inputs=image_selector, outputs=[smiles_input, image_display])
717
- smiles_input.change(smiles_to_image, inputs=smiles_input, outputs=image_display)
 
 
 
 
 
 
718
 
719
  # Generate button to display canonical SMILES and molecule image
720
- generate_button.click(generate_canonical, inputs=smiles_input,
721
- outputs=[property_table, generated_output, gen_image_display])
 
 
 
722
 
723
 
724
  if __name__ == "__main__":
725
- demo.launch(share=True)
 
1
  import gradio as gr
 
2
  import matplotlib.pyplot as plt
3
+ import numpy as np
4
+ import os
 
5
  import pandas as pd
6
+ import re
 
 
 
7
  import selfies as sf
 
8
  import torch
 
 
 
9
  import xgboost as xgb
10
+ from PIL import Image
11
+ from rdkit import Chem, RDLogger
12
+ from rdkit.Chem import DataStructs, AllChem, Descriptors, QED, Draw
13
+ from rdkit.Chem.Crippen import MolLogP
14
+ from rdkit.Contrib.SA_Score import sascorer
15
  from sklearn.kernel_ridge import KernelRidge
16
+ from sklearn.linear_model import LinearRegression
17
+ from sklearn.svm import SVR
18
+ from transformers import BartForConditionalGeneration, AutoTokenizer
19
+ from transformers.modeling_outputs import BaseModelOutput
20
 
21
  os.environ["OMP_MAX_ACTIVE_LEVELS"] = "1"
22
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  import models.fm4m as fm4m
24
 
25
+ RDLogger.logger().setLevel(RDLogger.ERROR)
26
+
27
 
28
  # Function to display molecule image from SMILES
29
def smiles_to_image(smiles):
    """Render a SMILES string as a molecule image.

    Args:
        smiles: SMILES text; in this app it is the value of the
            "Input SMILES String" textbox.

    Returns:
        The image produced by ``Draw.MolToImage`` for the parsed molecule,
        or ``None`` when the input is missing, blank, or cannot be parsed.
    """
    # A cleared textbox may deliver "" or None; skip RDKit entirely rather
    # than handing it a value that is not SMILES text.
    if not isinstance(smiles, str) or not smiles.strip():
        return None
    mol = Chem.MolFromSmiles(smiles)
    return Draw.MolToImage(mol) if mol else None
 
 
 
 
 
 
 
 
 
 
 
32
 
33
 
34
  # Dictionary for SMILES strings and corresponding images (you can replace with your actual image paths)
35
  smiles_image_mapping = {
36
+ "Mol 1": {
37
+ "smiles": "C=C(C)CC(=O)NC[C@H](CO)NC(=O)C=Cc1ccc(C)c(Cl)c1",
38
+ "image": "img/img1.png",
39
+ },
40
  # Example SMILES for ethanol
41
+ "Mol 2": {
42
+ "smiles": "C=CC1(CC(=O)NC[C@@H](CCCC)NC(=O)c2cc(Cl)cc(Br)c2)CC1",
43
+ "image": "img/img2.png",
44
+ },
45
  # Example SMILES for butane
46
+ "Mol 3": {
47
+ "smiles": "C=C(C)C[C@H](NC(C)=O)C(=O)N1CC[C@H](NC(=O)[C@H]2C[C@@]2(C)Br)C(C)(C)C1",
48
+ "image": "img/img3.png",
49
+ }, # Example SMILES for ethylamine
50
+ "Mol 4": {
51
+ "smiles": "C=C1CC(CC(=O)N[C@H]2CCN(C(=O)c3ncccc3SC)C23CC3)C1",
52
+ "image": "img/img4.png",
53
+ },
54
  # Example SMILES for diethyl ether
55
+ "Mol 5": {
56
+ "smiles": "C=CCS[C@@H](C)CC(=O)OCC",
57
+ "image": "img/img5.png",
58
+ }, # Example SMILES for chloroethane
59
  }
60
 
61
  datasets = [" ", "BACE", "ESOL", "Load Custom Dataset"]
62
 
63
+ models_enabled = [
64
+ "SELFIES-TED",
65
+ "MHG-GED",
66
+ "MolFormer",
67
+ "SMI-TED",
68
+ "Mordred",
69
+ "MorganFingerprint",
70
+ ]
71
 
72
  fusion_available = ["Concat"]
73
 
 
 
 
 
 
 
 
 
 
 
74
 
75
  # Function to handle evaluation and logging
76
def evaluate_and_log(models, dataset, task_type, eval_output, state):
    """Prepend the latest evaluation result to the session log table.

    Args:
        models: Selected model names; stored as ``str(models)``.
        dataset: Dataset name recorded in the "Dataset" column.
        task_type: "Classification" or "Regression"; any other value
            (e.g. ``None`` when no task was chosen) is logged as "N/A".
        eval_output: Evaluation result text; every " Score" substring is
            removed before logging.
        state: Session dict whose "log_df" entry holds the log DataFrame.
            The entry is replaced in place.

    Returns:
        The updated log DataFrame, newest entry first.
    """
    task_codes = {'Classification': 'CLS', 'Regression': 'RGR'}
    result = f"{eval_output}".replace(" Score", "")

    new_entry_df = pd.DataFrame(
        [
            {
                "Selected Models": str(models),
                "Dataset": dataset,
                # .get avoids a KeyError when the task radio is still unset.
                "Task": task_codes.get(task_type, "N/A"),
                "Result": result,
            }
        ]
    )

    # ignore_index keeps row labels unique instead of repeating label 0
    # for every stored entry.
    state["log_df"] = pd.concat(
        [new_entry_df, state["log_df"]], ignore_index=True
    )
    return state["log_df"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
 
92
 
93
  # Load images for selection
94
def load_image(path):
    """Open the sample image registered for a molecule key.

    Args:
        path: Despite its name, a key of ``smiles_image_mapping``
            (e.g. "Mol 1"), not a filesystem path.

    Returns:
        The image returned by ``Image.open``, or ``None`` when the key is
        unknown/unset or the image file cannot be opened.
    """
    try:
        return Image.open(smiles_image_mapping[path]["image"])
    except (KeyError, TypeError, OSError):
        # Best-effort preview: an unknown, None or unhashable key, or a
        # missing/unreadable file, simply leaves the image empty.
        return None
99
 
100
 
 
101
  # Function to handle image selection
102
  def handle_image_selection(image_key):
103
  smiles = smiles_image_mapping[image_key]["smiles"]
 
121
  mol1 = Chem.MolFromSmiles(smiles1)
122
  mol2 = Chem.MolFromSmiles(smiles2)
123
  if mol1 and mol2:
 
 
124
  fp1 = AllChem.GetMorganFingerprintAsBitVect(mol1, 2)
125
  fp2 = AllChem.GetMorganFingerprintAsBitVect(mol2, 2)
126
  return round(DataStructs.FingerprintSimilarity(fp1, fp2), 2)
127
  return None
128
 
129
 
 
 
 
130
  gen_tokenizer = AutoTokenizer.from_pretrained("ibm/materials.selfies-ted")
131
  gen_model = BartForConditionalGeneration.from_pretrained("ibm/materials.selfies-ted")
132
 
133
 
134
def generate(latent_vector, mask):
    """Sample molecule strings from encoder hidden states.

    ``latent_vector`` is wrapped in a ``BaseModelOutput`` and passed to
    ``gen_model.generate`` as the encoder output, with sampling enabled
    (top-k 5, top-p 0.95, one returned sequence, at most 64 new tokens).
    The sampled ids are decoded with ``gen_tokenizer`` and each decoded
    string is converted with ``sf.decoder``.

    Args:
        latent_vector: Encoder hidden states to decode from.
        mask: Attention mask forwarded to ``gen_model.generate``.

    Returns:
        A list with one ``sf.decoder`` result per decoded sequence.
    """
    sampled_ids = gen_model.generate(
        encoder_outputs=BaseModelOutput(latent_vector),
        attention_mask=mask,
        max_new_tokens=64,
        do_sample=True,
        top_k=5,
        top_p=0.95,
        num_return_sequences=1,
    )
    decoded_texts = gen_tokenizer.batch_decode(
        sampled_ids, skip_special_tokens=True
    )

    molecules = []
    for text in decoded_texts:
        # Drop the whitespace around whatever sits between a "]" and the
        # following "[" before handing the string to the SELFIES decoder.
        compact = re.sub(r'\]\s*(.*?)\s*\[', r']\1[', text)
        molecules.append(sf.decoder(compact))
    return molecules
 
 
 
147
 
148
 
149
def perturb_latent(latent_vecs, noise_scale=0.5):
    """Add uniform random noise to a latent tensor.

    Args:
        latent_vecs: Tensor to perturb; the noise has the same shape.
        noise_scale: Multiplier applied to draws from ``uniform(0, 1)``.

    Returns:
        ``noise + latent_vecs``, where ``noise`` is a float32 tensor of
        scaled uniform samples drawn with NumPy's global random state.
    """
    uniform_draw = np.random.uniform(0, 1, latent_vecs.shape)
    noise = torch.tensor(uniform_draw * noise_scale, dtype=torch.float32)
    return noise + latent_vecs
157
 
158
 
159
def encode(selfies):
    """Run the generator model's encoder over tokenized input.

    The input is tokenized to PyTorch tensors, truncated and padded to a
    fixed length of 128 tokens, then passed through
    ``gen_model.model.encoder``.

    Args:
        selfies: Text (or texts) handed directly to ``gen_tokenizer``.

    Returns:
        A ``(last_hidden_state, attention_mask)`` pair: the encoder's final
        hidden states and the tokenizer's attention mask.
    """
    tokens = gen_tokenizer(
        selfies,
        return_tensors='pt',
        max_length=128,
        truncation=True,
        padding='max_length',
    )
    attention_mask = tokens['attention_mask']
    encoder_out = gen_model.model.encoder(
        input_ids=tokens['input_ids'],
        attention_mask=attention_mask,
    )
    return encoder_out.last_hidden_state, attention_mask
174
 
175
 
 
184
  noise = i / 10
185
  perturbed_latent = perturb_latent(latent_vec, noise_scale=noise)
186
  gen = generate(perturbed_latent, mask)
187
+ mol = Chem.MolFromSmiles(gen[0])
188
+ if mol:
189
+ gen_mol = Chem.MolToSmiles(mol)
190
+ if gen_mol != Chem.MolToSmiles(Chem.MolFromSmiles(smiles)):
191
+ break
192
+ else:
193
+ print('Abnormal molecule:', gen[0])
194
 
195
  if gen_mol:
196
  # Calculate properties for ref and gen molecules
 
202
  # Prepare the table with ref mol and gen mol
203
  data = {
204
  "Property": ["QED", "SA", "LogP", "Mol Wt", "Tanimoto Similarity"],
205
+ "Reference Mol": [
206
+ ref_properties[0],
207
+ ref_properties[1],
208
+ ref_properties[2],
209
+ ref_properties[3],
210
+ tanimoto_similarity,
211
+ ],
212
+ "Generated Mol": [
213
+ gen_properties[0],
214
+ gen_properties[1],
215
+ gen_properties[2],
216
+ gen_properties[3],
217
+ "",
218
+ ],
219
  }
220
  df = pd.DataFrame(data)
221
 
 
228
 
229
 
230
  # Function to display evaluation score
231
+ def display_eval(selected_models, dataset, task_type, downstream, fusion_type, state):
232
  result = None
233
 
234
  try:
 
243
  downstream_model = downstream_model.rstrip()
244
  params = None
245
 
 
 
 
246
  try:
247
  if not selected_models:
248
  return "Please select at least one enabled model."
249
 
 
 
 
 
 
250
  if len(selected_models) > 1:
251
  if task_type == "Classification":
 
 
 
252
  if downstream_model == "Default Settings":
253
  downstream_model = "DefaultClassifier"
254
  params = None
 
 
 
 
255
 
256
+ (
257
+ result,
258
+ state["roc_auc"],
259
+ state["fpr"],
260
+ state["tpr"],
261
+ state["x_batch"],
262
+ state["y_batch"],
263
+ ) = fm4m.multi_modal(
264
+ model_list=selected_models,
265
+ downstream_model=downstream_model,
266
+ params=params,
267
+ dataset=dataset,
268
+ )
269
 
270
+ elif task_type == "Regression":
271
  if downstream_model == "Default Settings":
272
  downstream_model = "DefaultRegressor"
273
  params = None
274
 
275
+ (
276
+ result,
277
+ state["RMSE"],
278
+ state["y_batch_test"],
279
+ state["y_prob"],
280
+ state["x_batch"],
281
+ state["y_batch"],
282
+ ) = fm4m.multi_modal(
283
+ model_list=selected_models,
284
+ downstream_model=downstream_model,
285
+ params=params,
286
+ dataset=dataset,
287
+ )
288
 
289
  else:
290
  if task_type == "Classification":
 
 
 
291
  if downstream_model == "Default Settings":
292
  downstream_model = "DefaultClassifier"
293
  params = None
294
 
295
+ (
296
+ result,
297
+ state["roc_auc"],
298
+ state["fpr"],
299
+ state["tpr"],
300
+ state["x_batch"],
301
+ state["y_batch"],
302
+ ) = fm4m.single_modal(
303
+ model=selected_models[0],
304
+ downstream_model=downstream_model,
305
+ params=params,
306
+ dataset=dataset,
307
+ )
308
 
309
  elif task_type == "Regression":
 
 
 
 
310
  if downstream_model == "Default Settings":
311
  downstream_model = "DefaultRegressor"
312
  params = None
313
 
314
+ (
315
+ result,
316
+ state["RMSE"],
317
+ state["y_batch_test"],
318
+ state["y_prob"],
319
+ state["x_batch"],
320
+ state["y_batch"],
321
+ ) = fm4m.single_modal(
322
+ model=selected_models[0],
323
+ downstream_model=downstream_model,
324
+ params=params,
325
+ dataset=dataset,
326
+ )
327
 
328
  if result == None:
329
  result = "Data & Model Setting is incorrect"
 
333
 
334
 
335
  # Function to handle plot display
336
+ def display_plot(plot_type, state):
337
  fig, ax = plt.subplots()
338
 
339
  if plot_type == "Latent Space":
340
+ x_batch, y_batch = state.get("x_batch"), state.get("y_batch")
341
  ax.set_title("T-SNE Plot")
342
+ class_0 = x_batch
343
+ class_1 = y_batch
344
+
 
 
 
 
 
 
 
 
345
  plt.scatter(class_1[:, 0], class_1[:, 1], c='red', label='Class 1')
346
  plt.scatter(class_0[:, 0], class_0[:, 1], c='blue', label='Class 0')
347
 
 
350
  ax.set_title('Dataset Distribution')
351
 
352
  elif plot_type == "ROC-AUC":
353
+ roc_auc, fpr, tpr = state.get("roc_auc"), state.get("fpr"), state.get("tpr")
354
  ax.set_title("ROC-AUC Curve")
355
  try:
356
+ ax.plot(
357
+ fpr,
358
+ tpr,
359
+ color='darkorange',
360
+ lw=2,
361
+ label=f'ROC curve (area = {roc_auc:.4f})',
362
+ )
363
  ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
364
  ax.set_xlim([0.0, 1.0])
365
  ax.set_ylim([0.0, 1.05])
 
371
  ax.legend(loc='lower right')
372
 
373
  elif plot_type == "Parity Plot":
374
+ RMSE, y_batch_test, y_prob = (
375
+ state.get("RMSE"),
376
+ state.get("y_batch_test"),
377
+ state.get("y_prob"),
378
+ )
379
  ax.set_title("Parity plot")
380
 
381
  # change format
 
384
  print(y_prob)
385
  y_batch_test = np.array(y_batch_test, dtype=float)
386
  y_prob = np.array(y_prob, dtype=float)
387
+ ax.scatter(
388
+ y_batch_test,
389
+ y_prob,
390
+ color="blue",
391
+ label=f"Predicted vs Actual (RMSE: {RMSE:.4f})",
392
+ )
393
  min_val = min(min(y_batch_test), min(y_prob))
394
  max_val = max(max(y_batch_test), max(y_prob))
395
  ax.plot([min_val, max_val], [min_val, max_val], 'r-')
 
402
  print(y_batch_test)
403
  print(y_prob)
404
 
 
 
 
 
405
  ax.set_xlabel('Actual Values')
406
  ax.set_ylabel('Predicted Values')
407
 
 
420
  # Function to load a predefined dataset from the local path
421
def load_predefined_dataset(dataset_name):
    """Load a predefined dataset preview and its column choices.

    Args:
        dataset_name: Key looked up in ``predefined_datasets``.

    Returns:
        A 4-tuple ``(preview, input_update, output_update, message)``:
        the first rows of the CSV, two ``gr.update`` objects carrying the
        column names as dropdown choices, and the lower-cased dataset name.
        When no entry is found the preview is an empty DataFrame, the
        choices are empty and the message is "Dataset not found".
    """
    val = predefined_datasets.get(dataset_name)
    try:
        # The first comma-separated field of the entry is used as the CSV path.
        file_path = val.split(",")[0]
    except AttributeError:
        # No usable entry, e.g. ``None`` for a name that is not predefined.
        file_path = None

    if not file_path:
        return (
            pd.DataFrame(),
            gr.update(choices=[]),
            gr.update(choices=[]),
            "Dataset not found",
        )

    df = pd.read_csv(file_path)
    return (
        df.head(),
        gr.update(choices=list(df.columns)),
        gr.update(choices=list(df.columns)),
        dataset_name.lower(),
    )
442
 
443
 
444
  # Function to display the head of the uploaded CSV file
 
446
  if file is not None:
447
  # Load the CSV file into a DataFrame
448
  df = pd.read_csv(file.name)
449
+ return (
450
+ df.head(),
451
+ gr.update(choices=list(df.columns)),
452
+ gr.update(choices=list(df.columns)),
453
+ )
454
  return pd.DataFrame(), gr.update(choices=[]), gr.update(choices=[])
455
 
456
 
 
458
def handle_dataset_selection(selected_dataset):
    """Choose which dataset widgets are visible for the selected dataset.

    Args:
        selected_dataset: Current value of the dataset dropdown.

    Returns:
        A tuple of eight ``gr.update(visible=...)`` objects. The caller
        wires them, in order, to: dataset_name, train_file, train_display,
        test_file, test_display, predefined_display,
        input_column_selector, output_column_selector.
    """
    if selected_dataset == "Custom Dataset":
        # Custom upload: show everything except the predefined preview.
        flags = (True, True, True, True, True, False, True, True)
    else:
        # Any other selection: only the dataset name stays visible.
        flags = (True, False, False, False, False, False, False, False)
    return tuple(gr.update(visible=flag) for flag in flags)
482
 
483
 
484
  # Function to select input and output columns and display a message
485
def select_columns(input_column, output_column, train_data, test_data, dataset_name):
    """Build the comma-separated descriptor of a custom dataset selection.

    Args:
        input_column: Name of the chosen input column.
        output_column: Name of the chosen output column.
        train_data: Uploaded train file object (its ``.name`` is used), or
            ``None`` when nothing has been uploaded yet.
        test_data: Uploaded test file object (its ``.name`` is used), or
            ``None`` when nothing has been uploaded yet.
        dataset_name: Name appended as the last field.

    Returns:
        "<train name>,<test name>,<input column>,<output column>,<dataset name>"
        when both columns and both files are available, otherwise a message
        telling the user what is missing.
    """
    if not (input_column and output_column):
        return "Please select both input and output columns."
    # Column dropdowns are populated as soon as one file is uploaded, so a
    # column can be picked while the other file is still missing.
    if train_data is None or test_data is None:
        return "Please upload both train and test datasets."
    return f"{train_data.name},{test_data.name},{input_column},{output_column},{dataset_name}"
489
 
490
+
491
def set_dataname(dataset_name, dataset_selector):
    """Return the dataset name to use, formatted as a string.

    The typed ``dataset_name`` is used when the selector is
    "Custom Dataset"; otherwise the selector value itself is used.
    """
    is_custom = dataset_selector == "Custom Dataset"
    chosen = dataset_name if is_custom else dataset_selector
    return f"{chosen}"
495
 
496
+
497
  # Function to create model based on user input
498
+ def create_model(
499
+ model_name, max_depth=None, n_estimators=None, alpha=None, degree=None, kernel=None
500
+ ):
501
  if model_name == "XGBClassifier":
502
+ model = xgb.XGBClassifier(
503
+ objective='binary:logistic',
504
+ eval_metric='auc',
505
+ max_depth=max_depth,
506
+ n_estimators=n_estimators,
507
+ alpha=alpha,
508
+ )
509
  elif model_name == "SVR":
510
  model = SVR(degree=degree, kernel=kernel)
511
  elif model_name == "Kernel Ridge":
 
519
  return "Model not supported."
520
 
521
  return f"{model_name} * {model.get_params()}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
522
 
523
 
524
  # Define the Gradio layout
 
525
  with gr.Blocks() as demo:
526
+ log_df = pd.DataFrame(
527
+ {"": [], 'Selected Models': [], 'Dataset': [], 'Task': [], 'Result': []}
528
+ )
529
+ state = gr.State({"log_df": log_df})
530
  with gr.Row():
531
  # Left Column
532
  with gr.Column():
533
+ gr.HTML(
534
+ '''
535
  <div style="background-color: #6A8EAE; color: #FFFFFF; padding: 10px;">
536
  <h3 style="color: #FFFFFF; margin: 0;font-size: 20px;"> Data & Model Setting</h3>
537
  </div>
538
+ '''
539
+ )
 
 
540
  # Dropdown menu for predefined datasets including "Custom Dataset" option
541
+ dataset_selector = gr.Dropdown(
542
+ label="Select Dataset",
543
+ choices=list(predefined_datasets.keys()) + ["Custom Dataset"],
544
+ )
545
  # Display the message for selected columns
546
+ selected_columns_message = gr.Textbox(
547
+ label="Selected Columns Info", visible=False
548
+ )
549
 
550
  with gr.Accordion("Dataset Settings", open=True):
551
  # File upload options for custom dataset (train and test)
552
  dataset_name = gr.Textbox(label="Dataset Name", visible=False)
553
+ train_file = gr.File(
554
+ label="Upload Custom Train Dataset",
555
+ file_types=[".csv"],
556
+ visible=False,
557
+ )
558
+ train_display = gr.Dataframe(
559
+ label="Train Dataset Preview (First 5 Rows)",
560
+ visible=False,
561
+ interactive=False,
562
+ )
563
 
564
+ test_file = gr.File(
565
+ label="Upload Custom Test Dataset",
566
+ file_types=[".csv"],
567
+ visible=False,
568
+ )
569
+ test_display = gr.Dataframe(
570
+ label="Test Dataset Preview (First 5 Rows)",
571
+ visible=False,
572
+ interactive=False,
573
+ )
574
 
575
  # Predefined dataset displays
576
+ predefined_display = gr.Dataframe(
577
+ label="Predefined Dataset Preview (First 5 Rows)",
578
+ visible=False,
579
+ interactive=False,
580
+ )
581
 
582
  # Dropdowns for selecting input and output columns for the custom dataset
583
+ input_column_selector = gr.Dropdown(
584
+ label="Select Input Column", choices=[], visible=False
585
+ )
586
+ output_column_selector = gr.Dropdown(
587
+ label="Select Output Column", choices=[], visible=False
588
+ )
589
 
590
  # When a dataset is selected, show either file upload fields (for custom) or load predefined datasets
591
+ dataset_selector.change(
592
+ handle_dataset_selection,
593
+ inputs=dataset_selector,
594
+ outputs=[
595
+ dataset_name,
596
+ train_file,
597
+ train_display,
598
+ test_file,
599
+ test_display,
600
+ predefined_display,
601
+ input_column_selector,
602
+ output_column_selector,
603
+ ],
604
+ )
605
 
606
  # When a predefined dataset is selected, load its head and update column selectors
607
+ dataset_selector.change(
608
+ load_predefined_dataset,
609
+ inputs=dataset_selector,
610
+ outputs=[
611
+ predefined_display,
612
+ input_column_selector,
613
+ output_column_selector,
614
+ selected_columns_message,
615
+ ],
616
+ )
617
 
618
  # When a custom train file is uploaded, display its head and update column selectors
619
+ train_file.change(
620
+ display_csv_head,
621
+ inputs=train_file,
622
+ outputs=[
623
+ train_display,
624
+ input_column_selector,
625
+ output_column_selector,
626
+ ],
627
+ )
628
 
629
  # When a custom test file is uploaded, display its head
630
+ test_file.change(
631
+ display_csv_head,
632
+ inputs=test_file,
633
+ outputs=[
634
+ test_display,
635
+ input_column_selector,
636
+ output_column_selector,
637
+ ],
638
+ )
639
 
640
+ dataset_selector.change(
641
+ set_dataname,
642
+ inputs=[dataset_name, dataset_selector],
643
+ outputs=dataset_name,
644
+ )
645
 
646
  # Update the selected columns information when dropdown values are changed
647
+ input_column_selector.change(
648
+ select_columns,
649
+ inputs=[
650
+ input_column_selector,
651
+ output_column_selector,
652
+ train_file,
653
+ test_file,
654
+ dataset_name,
655
+ ],
656
+ outputs=selected_columns_message,
657
+ )
658
 
659
+ output_column_selector.change(
660
+ select_columns,
661
+ inputs=[
662
+ input_column_selector,
663
+ output_column_selector,
664
+ train_file,
665
+ test_file,
666
+ dataset_name,
667
+ ],
668
+ outputs=selected_columns_message,
669
+ )
670
 
671
+ model_checkbox = gr.CheckboxGroup(
672
+ choices=models_enabled, label="Select Model"
673
+ )
674
 
675
+ task_radiobutton = gr.Radio(
676
+ choices=["Classification", "Regression"], label="Task Type"
677
+ )
678
 
679
  ####### adding hyper parameter tuning ###########
680
+ model_name = gr.Dropdown(
681
+ [
682
+ "Default - Auto",
683
+ "XGBClassifier",
684
+ "SVR",
685
+ "Kernel Ridge",
686
+ "Linear Regression",
687
+ ],
688
+ label="Select Downstream Model",
689
+ )
690
  with gr.Accordion("Downstream Hyperparameter Settings", open=True):
691
  # Create placeholders for hyperparameter components
692
+ max_depth = gr.Slider(1, 20, step=1, visible=False, label="max_depth")
693
+ n_estimators = gr.Slider(
694
+ 100, 5000, step=100, visible=False, label="n_estimators"
695
+ )
696
  alpha = gr.Slider(0.1, 10.0, step=0.1, visible=False, label="alpha")
697
+ degree = gr.Slider(1, 20, step=1, visible=False, label="degree")
698
+ kernel = gr.Dropdown(
699
+ choices=["rbf", "poly", "linear"], visible=False, label="kernel"
700
+ )
701
 
702
  # Output textbox
703
  output = gr.Textbox(label="Loaded Parameters")
704
 
 
705
  # Dynamically show relevant hyperparameters based on selected model
706
def update_hyperparameters(model_name):
    """Show only the hyperparameter widgets relevant to the chosen model.

    Args:
        model_name: Value of the downstream-model dropdown.

    Returns:
        A tuple of five ``gr.update(visible=...)`` objects in the order the
        event is wired: max_depth, n_estimators, alpha, degree, kernel.
        Models without tunable widgets ("Linear Regression",
        "Default - Auto") and any unrecognised or empty selection hide all
        five, so the handler always returns one update per output.
    """
    # Flags follow the output order: max_depth, n_estimators, alpha,
    # degree, kernel.
    visible_by_model = {
        "XGBClassifier": (True, True, True, False, False),
        "SVR": (False, False, False, True, True),
        "Kernel Ridge": (False, False, True, True, True),
    }
    flags = visible_by_model.get(model_name, (False,) * 5)
    return tuple(gr.update(visible=flag) for flag in flags)
747
 
748
  # When model is selected, update which hyperparameters are visible
749
+ model_name.change(
750
+ update_hyperparameters,
751
+ inputs=[model_name],
752
+ outputs=[max_depth, n_estimators, alpha, degree, kernel],
753
+ )
754
 
755
  # Submit button to create the model with selected hyperparameters
756
  submit_button = gr.Button("Create Downstream Model")
757
 
 
758
  # Function to handle model creation based on input parameters
759
def on_submit(model_name, max_depth, n_estimators, alpha, degree, kernel):
    """Forward the hyperparameters relevant to ``model_name`` to ``create_model``.

    Only the settings that apply to the chosen model are passed on; the
    remaining arguments are ignored.

    Returns:
        Whatever ``create_model`` returns, or ``None`` when ``model_name``
        is not one of the handled options.
    """
    kwargs_by_model = {
        "XGBClassifier": {
            "max_depth": max_depth,
            "n_estimators": n_estimators,
            "alpha": alpha,
        },
        "SVR": {"degree": degree, "kernel": kernel},
        "Kernel Ridge": {"alpha": alpha, "degree": degree, "kernel": kernel},
        "Linear Regression": {},
        "Default - Auto": {},
    }
    if model_name not in kwargs_by_model:
        return None
    return create_model(model_name, **kwargs_by_model[model_name])
777
 
778
  # When the submit button is clicked, run the on_submit function
779
+ submit_button.click(
780
+ on_submit,
781
+ inputs=[model_name, max_depth, n_estimators, alpha, degree, kernel],
782
+ outputs=output,
783
+ )
784
  ###### End of hyper param tuning #########
785
 
786
  fusion_radiobutton = gr.Radio(choices=fusion_available, label="Fusion Type")
787
 
 
 
788
  eval_button = gr.Button("Train downstream model")
 
789
 
790
  # Middle Column
791
  with gr.Column():
792
+ gr.HTML(
793
+ '''
794
  <div style="background-color: #8F9779; color: #FFFFFF; padding: 10px;">
795
  <h3 style="color: #FFFFFF; margin: 0;font-size: 20px;"> Downstream Task 1: Property Prediction</h3>
796
  </div>
797
+ '''
798
+ )
799
  eval_output = gr.Textbox(label="Train downstream model")
800
 
801
+ plot_radio = gr.Radio(
802
+ choices=["ROC-AUC", "Parity Plot", "Latent Space"],
803
+ label="Select Plot Type",
804
+ )
805
+ plot_output = gr.Plot(label="Visualization")
806
 
807
  create_log = gr.Button("Store log")
808
 
809
+ log_table = gr.Dataframe(
810
+ value=log_df, label="Log of Selections and Results", interactive=False
811
+ )
812
+
813
+ eval_button.click(
814
+ display_eval,
815
+ inputs=[
816
+ model_checkbox,
817
+ selected_columns_message,
818
+ task_radiobutton,
819
+ output,
820
+ fusion_radiobutton,
821
+ state,
822
+ ],
823
+ outputs=eval_output,
824
+ )
825
+
826
+ plot_radio.change(
827
+ display_plot, inputs=[plot_radio, state], outputs=plot_output
828
+ )
829
 
830
  # Function to gather selected models
831
def gather_selected_models(*models):
    """Return the truthy entries of ``models`` as a list, in their given order."""
    return list(filter(None, models))
834
 
835
+ create_log.click(
836
+ evaluate_and_log,
837
+ inputs=[
838
+ model_checkbox,
839
+ dataset_name,
840
+ task_radiobutton,
841
+ eval_output,
842
+ state,
843
+ ],
844
+ outputs=log_table,
845
+ )
846
  # Right Column
847
  with gr.Column():
848
+ gr.HTML(
849
+ '''
850
  <div style="background-color: #D2B48C; color: #FFFFFF; padding: 10px;">
851
  <h3 style="color: #FFFFFF; margin: 0;font-size: 20px;"> Downstream Task 2: Molecule Generation</h3>
852
  </div>
853
+ '''
854
+ )
855
  smiles_input = gr.Textbox(label="Input SMILES String")
856
  image_display = gr.Image(label="Molecule Image", height=250, width=250)
857
  # Show images for selection
 
860
  choices=list(smiles_image_mapping.keys()),
861
  label="Select from sample molecules",
862
  value=None,
 
863
  )
864
  image_selector.change(load_image, image_selector, image_display)
865
  generate_button = gr.Button("Generate")
866
+ gen_image_display = gr.Image(
867
+ label="Generated Molecule Image", height=250, width=250
868
+ )
869
  generated_output = gr.Textbox(label="Generated Output")
870
  property_table = gr.Dataframe(label="Molecular Properties Comparison")
871
 
 
 
872
  # Handle image selection
873
+ image_selector.change(
874
+ handle_image_selection,
875
+ inputs=image_selector,
876
+ outputs=[smiles_input, image_display],
877
+ )
878
+ smiles_input.change(
879
+ smiles_to_image, inputs=smiles_input, outputs=image_display
880
+ )
881
 
882
  # Generate button to display canonical SMILES and molecule image
883
+ generate_button.click(
884
+ generate_canonical,
885
+ inputs=smiles_input,
886
+ outputs=[property_table, generated_output, gen_image_display],
887
+ )
888
 
889
 
890
  if __name__ == "__main__":
891
+ demo.launch(server_name="0.0.0.0")
data/lce/test.csv ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ smi1,conc1,smi2,conc2,smi3,conc3,smi4,conc4,smi5,conc5,smi6,conc6,LCE
2
+ C1C(OC(=O)O1)F,0.733,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.267,O,0.0,O,0.0,O,0.0,O,0.0,1.629
3
+ C1C(OC(=O)O1)F,0.497,COC(=O)OC,0.431,[Li+].F[P-](F)(F)(F)(F)F,0.072,O,0.0,O,0.0,O,0.0,1.085
4
+ COC(=O)OC,0.299,C(C(F)(F)F)OCC(F)(F)F,0.598,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.103,O,0.0,O,0.0,O,0.0,2.056
5
+ COCCOC,0.358,O1CCOC1,0.532,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.074,[Li+].[N+](=O)([O-])[O-],,O,0.0,O,0.0,1.658
6
+ C1COC(=O)O1,0.197,COC(=O)OC,0.156,COCCOCCOCCOCCOC,0.59,[Li+].F[P-](F)(F)(F)(F)F,0.026,[Li+].[N+](=O)([O-])[O-],0.031,O,0.0,1.638
7
+ C1COC(=O)O1,0.496,COC(=O)OC,0.43,O1CCOCCOCCOCC1,0.002,[Li+].F[P-](F)(F)(F)(F)F,0.072,O,0.0,O,0.0,1.276
8
+ O1CCOC1,0.368,COCCOC,0.547,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.076,CSi(C)(C)([N+]).C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)C(F)(F)C(F)(F)C(F)(F)F,0.008,O,0.0,O,0.0,1.569
9
+ COCCOC,0.507,COC(C(F)(F)F)C(F)(F)F,0.399,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.095,O,0.0,O,0.0,O,0.0,2.268
10
+ C1COC(=O)O1,0.425,O=C(OCC)OCC(F)(F)F,0.481,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.094,O,0.0,O,0.0,O,0.0,1.602
11
+ C1C(OC(=O)O1)F,0.318,CCOC(=O)OC,0.504,COC(=O)OC,0.094,B(O[Si](C)(C)C)(O[Si](C)(C)C)O[Si](C)(C),0.083,[Li+].F[P-](F)(F)(F)(F)F,0.001,O,0.0,1.678
12
+ O=S1(=O)CCCC1,0.359,C(C(F)(F)F)OC(C(F)F)(F)F,0.504,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.133,[Li+].[N+](=O)([O-])[O-],0.004,O,0.0,O,0.0,2.0
13
+ C1COC(=O)O1,0.594,O=C(OCC)OCC,0.327,[Li+].F[P-](F)(F)(F)(F)F,0.079,O,0.0,O,0.0,O,0.0,0.921
14
+ C1COC(=O)O1,0.331,O=C(OCC)OCC,0.577,[Li+].[B-]1(OC(=O)C(=O)O1)(F)F,0.092,O,0.0,O,0.0,O,0.0,1.301
15
+ C1COC(=O)O1,0.507,COC(=O)OC,0.402,C1=COC(=O)O1,0.022,[Li+].C(C(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(C(F)(F)F)(F)F)(F)(F)F,0.069,O,0.0,O,0.0,0.854
16
+ C1C(OC(=O)O1)F,0.107,C1COC(=O)O1,0.526,O=C(OCC)OCC,0.289,[Li+].F[P-](F)(F)(F)(F)F,0.078,O,0.0,O,0.0,1.108
17
+ O1CCOC1,0.322,COCCOC,0.478,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)C(F)(F)C(F)(F)C(F)(F)F,0.2,O,0.0,O,0.0,O,0.0,1.523
18
+ CC1COC(=O)O1,0.595,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.405,O,0.0,O,0.0,O,0.0,O,0.0,1.921
19
+ CC1COC(=O)O1,0.702,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.298,O,0.0,O,0.0,O,0.0,O,0.0,1.602
20
+ O1CCOC1,0.375,COCCOC,0.557,[Li+][S-]SSS[S-][Li+],,[Li+].[N+](=O)([O-])[O-],0.008,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.061,O,0.0,1.523
21
+ COC(=O)OC,0.161,FC(F)C(F)(F)COC(F)(F)C(F)F,0.355,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.484,O,0.0,O,0.0,O,0.0,2.155
22
+ C1COC(=O)O1,0.338,COC(=O)OC,0.625,[Li+].[O-]P(=O)(F)F,0.008,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.03,O,0.0,O,0.0,1.26
23
+ CN(C)C(=O)C(F)(F)F,0.362,C1C(OC(=O)O1)F,0.556,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.081,O,0.0,O,0.0,O,0.0,2.155
24
+ C1C(OC(=O)O1)F,0.497,COC(=O)OC,0.43,O1CCOCCOCCOCC1,0.0,[Li+].F[P-](F)(F)(F)(F)F,0.072,O,0.0,O,0.0,1.225
25
+ COCCOC,0.231,FC1CCCCC1,0.577,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.192,O,0.0,O,0.0,O,0.0,2.155
26
+ COCCOC,0.277,FC(F)C(F)(F)COC(F)(F)C(F)F,0.555,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.168,O,0.0,O,0.0,O,0.0,2.155
27
+ O1C(C)CCC1,0.331,FC(F)C(F)(F)COC(F)(F)C(F)F,0.498,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.171,O,0.0,O,0.0,O,0.0,2.301
28
+ COCC(F)(F)C(F)(F)COC,0.864,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.136,O,0.0,O,0.0,O,0.0,O,0.0,1.991
29
+ COC(=O)OC,0.29,C(C(F)(F)F)OCC(F)(F)F,0.589,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.121,O,0.0,O,0.0,O,0.0,2.301
30
+ C1COC(=O)O1,0.425,O=C(OCC)OCC,0.234,[Li+].F[P-](F)(F)(F)(F)F,0.34,O,0.0,O,0.0,O,0.0,1.398
31
+ COCCOC,0.707,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.147,[Li+].[B-]1(OC(=O)C(=O)O1)(F)F,0.147,O,0.0,O,0.0,O,0.0,1.268
data/lce/test_data.csv ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ smiles1,conc1,mol1,smiles2,conc2,mol2,smiles3,conc3,mol3,smiles4,conc4,mol4,smiles5,conc5,mol5,smiles6,conc6,LCE_Predicted,LCE
2
+ C1COC(=O)O1,0.519,51.92400559,COC(=O)OC,0.411,41.14791596,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,0.069,6.928078454,O,0,0,O,0,0,O,0,1.187,1.094
3
+ COCCOC,0.596,59.5609428,COCCOCCOCCOCCOC,0.281,28.07124115,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,0.124,12.36781605,O,0,0,O,0,0,O,0,1.691,1.384
4
+ C1COC(=O)O1,0.285,28.50894036,C1C(OC(=O)O1)F,0.261,26.07552384,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,0.228,22.82322096,COC(=O)OC,0.226,22.59231484,O,0,0,O,0,1.508,1.468
5
+ COCCOC,0.434,43.4423376,COCCOCCOCCOCCOC,0.205,20.47449683,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,0.361,36.08316557,O,0,0,O,0,0,O,0,1.882,1.71
6
+ C1C(OC(=O)O1)F,0.187,18.72872664,COC(=O)OC,0.162,16.22691423,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,0.109,10.92850826,FC(F)C(F)(F)COC(F)(F)C(F)F,0.541,54.11585087,O,0,0,O,0,2.103,1.832
7
+ C1COC(=O)O1,0.134,13.35070843,C1C(OC(=O)O1)F,0.122,12.2111419,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,0.107,10.72028474,COC(=O)OC,0.106,10.57995858,FC(F)C(F)(F)COC(F)(F)C(F)F,0.531,53.13790635,O,0,2.077,2.104
8
+ COCCOC,0.096,9.614613177,COCCOCCOCCOCCOC,0.045,4.53139444,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,0.12,12.01491409,C1COCO1,0.143,14.28400162,FC(F)C(F)(F)COC(F)(F)C(F)F,0.596,59.55507668,O,0,2.211,2.274
9
+ C1COC(=O)O1,0.519,51.92400559,COC(=O)OC,0.411,41.14791596,[Li+].F[P-](F)(F)(F)(F)F,0.069,6.928078454,O,0,0,O,0,0,O,0,1.17,1.071
10
+ C1COC(=O)O1,0.519,51.92400559,COC(=O)OC,0.411,41.14791596,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.069,6.928078454,O,0,0,O,0,0,O,0,1.077,1.166
11
+ C1COC(=O)O1,0.519,51.85215842,COC(=O)OC,0.411,41.09097965,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,0.069,6.918492083,[Li+].[N+](=O)([O-])[O-],0.001,0.138369842,O,0,0,O,0,1.19,1.335
12
+ C1COC(=O)O1,0.513,51.33049845,COC(=O)OC,0.407,40.6775828,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,0.069,6.9173773,C1=COC(=O)O1,0.011,1.07454145,O,0,0,O,0,1.114,1.129
13
+ COCCOC,0.53,53.00533987,COCCOCCOCCOCCOC,0.25,24.98156691,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,0.22,22.01309322,O,0,0,O,0,0,O,0,1.758,1.501
14
+ COCCOC,0.477,47.74974224,COCCOCCOCCOCCOC,0.225,22.50458884,[Li+].[N-](S(=O)(=O)F)S(=O)(=O)F,0.297,29.74566892,O,0,0,O,0,0,O,0,1.821,1.663
data/lce/train.csv ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ smi1,conc1,smi2,conc2,smi3,conc3,smi4,conc4,smi5,conc5,smi6,conc6,LCE
2
+ C1COC(=O)O1,0.327,O=C(OCC)OCC,0.594,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.079,O,0.0,O,0.0,O,0.0,1.155
3
+ C1COC(=O)O1,0.356,COC(=O)OC,0.566,FC(F)(F)COB(OCC(F)(F)F)OCC(F)(F)F,0.007,[Li+].F[P-](F)(F)(F)(F)F,0.072,O,0.0,O,0.0,1.046
4
+ O=S1(=O)CCCC1,0.25,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.75,O,0.0,O,0.0,O,0.0,O,0.0,1.569
5
+ C1COC(=O)O1,0.331,O=C(OCC)OCC,0.577,[Li+].F[P-](F)(F)(F)(F)F,0.092,O,0.0,O,0.0,O,0.0,0.886
6
+ COCCOC,0.763,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.237,O,0.0,O,0.0,O,0.0,O,0.0,1.367
7
+ COCCOC,0.2,FC(F)C(F)(F)COC(F)(F)C(F)F,0.6,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.2,O,0.0,O,0.0,O,0.0,2.301
8
+ C1C(OC(=O)O1)F,0.873,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.127,O,0.0,O,0.0,O,0.0,O,0.0,1.489
9
+ COCCOC,0.706,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.008,[Li+].[O-]P(=O)(F)F,0.286,O,0.0,O,0.0,O,0.0,1.244
10
+ C1COC(=O)O1,0.3,CCOC(=O)OC,0.593,C1=COC(=O)O1,0.026,[Li+].F[P-](F)(F)(F)(F)F,0.081,O,0.0,O,0.0,0.745
11
+ COCCOC,0.763,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.174,[Li+].[O-]P(=O)(F)F,0.063,O,0.0,O,0.0,O,0.0,1.292
12
+ CCOCC,0.313,C(C(F)(F)F)OCC(F)(F)F,0.51,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.177,O,0.0,O,0.0,O,0.0,2.301
13
+ O=S1(=O)CCCC1,0.75,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.25,O,0.0,O,0.0,O,0.0,O,0.0,1.745
14
+ COC(=O)OC,0.29,C(C(F)(F)F)OCC(F)(F)F,0.589,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.121,O,0.0,O,0.0,O,0.0,1.745
15
+ C1COC(=O)O1,0.682,CCOC(=O)OC,0.247,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.043,[Li+].O=C1O[B-]2(OC1=O)OC(=O)C(=O)O2,0.028,O,0.0,O,0.0,1.076
16
+ C1COC(=O)O1,0.359,COC(=O)OC,0.569,[Li+].F[P-](F)(F)(F)(F)F,0.072,O,0.0,O,0.0,O,0.0,0.854
17
+ C1COC(=O)O1,0.305,COC(=O)OC,0.242,COCCOCCOCCOCCOC,0.392,[Li+].F[P-](F)(F)(F)(F)F,0.041,[Li+].[N+](=O)([O-])[O-],0.02,O,0.0,1.678
18
+ FC(F)(F)COCCOCC,0.838,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.162,O,0.0,O,0.0,O,0.0,O,0.0,2.155
19
+ CC#N,0.882,FC,0.065,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,,O,0.0,O,0.0,O,0.0,2.222
20
+ COC(C)C(C)OC,0.879,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.121,O,0.0,O,0.0,O,0.0,O,0.0,1.638
21
+ CCOP(=O)(OCC)OCC,0.728,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.272,O,0.0,O,0.0,O,0.0,O,0.0,2.0
22
+ COC(=O)OC,0.375,FC(F)C(F)(F)COC(F)(F)C(F)F,0.375,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.25,O,0.0,O,0.0,O,0.0,1.854
23
+ O1CCOC1,0.371,COCCOC,0.552,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.077,O,0.0,O,0.0,O,0.0,1.959
24
+ C1C(OC(=O)O1)F,0.774,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.226,O,0.0,O,0.0,O,0.0,O,0.0,1.587
25
+ CC1COC(=O)O1,0.875,C1C(OC(=O)O1)F,0.051,[Li+].[O-]Cl(=O)(=O)=O,0.074,O,0.0,O,0.0,O,0.0,0.699
26
+ C1C(OC(=O)O1)F,0.264,COC(=O)OCCF,0.479,C(C(F)(F)F)OC(C(F)F)(F)F,0.155,[Li+].F[P-](F)(F)(F)(F)F,0.103,O,0.0,O,0.0,2.097
27
+ C1C(OC(=O)O1)F,0.413,O=C(OCC)OCC,0.497,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.09,O,0.0,O,0.0,O,0.0,1.59
28
+ C1C(OC(=O)O1)F,0.106,C1COC(=O)O1,0.522,O=C(OCC)OCC,0.287,[Li+].F[P-](F)(F)(F)(F)F,0.077,[Rb+].[O-][N+]([O-])=O,0.004,O1CCOCCOCCOCCOCCOCC1,0.004,1.252
29
+ COCCOC,0.259,B(OCC(F)(F)F)(OCC(F)(F)F)OCC(F)(F)F,0.556,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.185,O,0.0,O,0.0,O,0.0,1.337
30
+ C1CCOC1,0.925,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.075,O,0.0,O,0.0,O,0.0,O,0.0,1.377
31
+ C1C(OC(=O)O1)F,0.82,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.18,O,0.0,O,0.0,O,0.0,O,0.0,1.544
32
+ CCOP(=O)(OCC)OCC,0.5,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.5,O,0.0,O,0.0,O,0.0,O,0.0,2.097
33
+ COCCOC,0.731,[Li+].[O-]P(=O)(F)F,0.064,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.205,O,0.0,O,0.0,O,0.0,1.215
34
+ COCCOCCOCCOCCOC,0.819,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.181,O,0.0,O,0.0,O,0.0,O,0.0,1.222
35
+ C1COC(=O)O1,0.338,COC(=O)OC,0.625,[Li+].[O-]P(=O)(F)F,0.008,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.03,O,0.0,O,0.0,1.194
36
+ O1CCOC1,0.463,COCCOC,0.312,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.194,[Li+].[N+](=O)([O-])[O-],0.03,O,0.0,O,0.0,1.824
37
+ C1C(OC(=O)O1)F,0.496,COC(=O)OC,0.43,O1CCOCCOCCOCC1,0.002,[Li+].F[P-](F)(F)(F)(F)F,0.072,O,0.0,O,0.0,1.333
38
+ O1CCOC1,0.539,COCCOC,0.363,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.075,[Li+].[N+](=O)([O-])[O-],0.023,O,0.0,O,0.0,1.824
39
+ COCCOC,0.257,C(C(F)(F)F)OCC(F)(F)F,0.508,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.235,O,0.0,O,0.0,O,0.0,2.051
40
+ COCCOC,0.906,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.047,[Li+].FP(F)(=O)([O-]),0.047,O,0.0,O,0.0,O,0.0,1.444
41
+ O1CCOC1,0.478,COCCOC,0.322,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.134,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.067,O,0.0,O,0.0,1.854
42
+ CCOCC,0.707,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.293,O,0.0,O,0.0,O,0.0,O,0.0,2.046
43
+ C1COC(=O)O1,0.563,O=C(OCC)OCC,0.31,C1C(OC(=O)O1)F,0.052,[Li+].F[P-](F)(F)(F)(F)F,0.075,O,0.0,O,0.0,1.301
44
+ C1CCOC1,0.942,FC,0.029,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,,O,0.0,O,0.0,O,0.0,2.222
45
+ O1CCOC1,0.478,COCCOC,0.322,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.2,O,0.0,O,0.0,O,0.0,1.903
46
+ COCCOC,0.906,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.094,O,0.0,O,0.0,O,0.0,O,0.0,1.561
47
+ C1C(OC(=O)O1)F,0.149,COC(=O)OCCF,0.178,C(C(F)(F)F)OC(C(F)F)(F)F,0.564,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.108,O,0.0,O,0.0,1.735
48
+ FC(F)COCCOCC(F)(F),0.845,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.155,O,0.0,O,0.0,O,0.0,O,0.0,2.301
49
+ C1C(OC(=O)O1)F,0.495,COC(=O)OC,0.429,O1CCOCCOCCOCC1,0.003,[Li+].F[P-](F)(F)(F)(F)F,0.072,O,0.0,O,0.0,1.498
50
+ C1COC(=O)O1,0.507,COC(=O)OC,0.402,C1=COC(=O)O1,0.022,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.069,O,0.0,O,0.0,0.745
51
+ O=S1(=O)CCCC1,0.758,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.235,[Li+].[N+](=O)([O-])[O-],0.007,O,0.0,O,0.0,O,0.0,1.824
52
+ CCOCC,0.856,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.144,O,0.0,O,0.0,O,0.0,O,0.0,2.0
53
+ O=C(OCC)C,0.105,ClCCl,0.64,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.255,O,0.0,O,0.0,O,0.0,1.456
54
+ COCCOCCOCC(F)(F)OC(F)(F)OC(F)(F)COCCOCCOC,0.708,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.292,O,0.0,O,0.0,O,0.0,O,0.0,1.301
55
+ COCCOC,0.583,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.278,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.139,O,0.0,O,0.0,O,0.0,1.678
56
+ C1C(OC(=O)O1)F,0.662,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.338,O,0.0,O,0.0,O,0.0,O,0.0,1.646
57
+ O1CCOC1,0.397,COCCOC,0.589,[Li+][S-]SSS[S-][Li+],,[Li+].[N+](=O)([O-])[O-],0.012,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.002,O,0.0,1.301
58
+ C1COC(=O)O1,0.308,O=C(OCC)OCC(F)(F)F,0.349,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.343,O,0.0,O,0.0,O,0.0,2.046
59
+ C1COC(=O)O1,0.362,O=C(OCC)OCC,0.548,[Li+].F[P-](F)(F)(F)(F)F,0.09,O,0.0,O,0.0,O,0.0,0.788
60
+ C1C(OC(=O)O1)F,0.497,COC(=O)OC,0.43,O1CCOCCOCCOCC1,0.001,[Li+].F[P-](F)(F)(F)(F)F,0.072,O,0.0,O,0.0,1.373
61
+ O1CCOCC1,0.912,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.088,O,0.0,O,0.0,O,0.0,O,0.0,1.602
62
+ CC#N,0.621,C1=COC(=O)O1,0.056,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.323,O,0.0,O,0.0,O,0.0,1.854
63
+ COC(=O)OC,0.684,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.316,O,0.0,O,0.0,O,0.0,O,0.0,2.097
64
+ O=S1(=O)CCCC1,0.714,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.286,O,0.0,O,0.0,O,0.0,O,0.0,1.699
65
+ FC(F)(F)COCCOCC(F)(F)(F),0.838,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.162,O,0.0,O,0.0,O,0.0,O,0.0,2.155
66
+ CCOCC,0.64,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.36,O,0.0,O,0.0,O,0.0,O,0.0,2.208
67
+ COC(=O)OC,0.6,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.4,O,0.0,O,0.0,O,0.0,O,0.0,1.77
68
+ CC1COC(=O)O1,0.887,[Li+].F[As-](F)(F)(F)(F)F,0.113,O,0.0,O,0.0,O,0.0,O,0.0,0.824
69
+ C1COC(=O)O1,0.5,CCOC(=O)OC,0.423,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.046,[Li+].O=C1O[B-]2(OC1=O)OC(=O)C(=O)O2,0.031,O,0.0,O,0.0,0.924
70
+ CCOP(=O)(OCC)OCC,0.214,C(C(F)(F)F)OCC(F)(F)F,0.642,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.144,O,0.0,O,0.0,O,0.0,2.097
71
+ COCCOC,0.682,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.318,O,0.0,O,0.0,O,0.0,O,0.0,2.108
72
+ CC1COC(=O)O1,0.922,[LI+].F[B-](F)(F)OC(C(F)(F)(F))(C(F)(F)(F))C(F)(F)(F),0.078,O,0.0,O,0.0,O,0.0,O,0.0,0.712
73
+ C1COC(=O)O1,0.854,CCOC(=O)OC,0.08,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.039,[Li+].O=C1O[B-]2(OC1=O)OC(=O)C(=O)O2,0.026,O,0.0,O,0.0,1.081
74
+ C1COC(=O)O1,0.519,O=C(OCC)OCC,0.387,[Li+].F[P-](F)(F)(F)(F)F,0.082,[Li+].[O-]P(=O)(F)F,0.012,O,0.0,O,0.0,1.319
75
+ COC(=O)CC(F)(F)F,0.768,C1C(OC(=O)O1)F,0.134,[Li+].F[P-](F)(F)(F)(F)F,0.098,O,0.0,O,0.0,O,0.0,1.62
76
+ C1C(OC(=O)O1)F,0.144,COC(=O)OCCF,0.173,C(C(F)(F)F)OC(C(F)F)(F)F,0.548,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.135,O,0.0,O,0.0,2.222
77
+ C1COC(=O)O1,0.326,COC(=O)OC,0.602,[Li+].F[P-](F)(F)(F)(F)F,0.072,O,0.0,O,0.0,O,0.0,0.777
78
+ CCOCC,0.877,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.123,O,0.0,O,0.0,O,0.0,O,0.0,2.018
79
+ COC(=O)OC,0.664,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.336,O,0.0,O,0.0,O,0.0,O,0.0,1.886
80
+ C1COC(=O)O1,0.507,COC(=O)OC,0.402,C1=COC(=O)O1,0.022,[Li+].F[B-](F)(F)F,0.069,O,0.0,O,0.0,0.699
81
+ CCOP(=O)(OCC)OCC,0.648,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.352,O,0.0,O,0.0,O,0.0,O,0.0,1.569
82
+ C1C(OC(=O)O1)F,0.481,O=C(OCC)OCC,0.432,[Li+].F[P-](F)(F)(F)(F)F,0.087,O,0.0,O,0.0,O,0.0,1.523
83
+ COCCOC,0.231,FC(F)C(F)(F)COC(F)(F)C(F)F,0.577,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.192,O,0.0,O,0.0,O,0.0,2.155
84
+ C1C(OC(=O)O1)F,0.496,COC(=O)OC,0.43,O1CCOCCOCCOCC1,0.001,[Li+].F[P-](F)(F)(F)(F)F,0.072,O,0.0,O,0.0,1.488
85
+ O1CCOC1,0.453,COCCOC,0.305,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.127,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.063,[Li+].[N+](=O)([O-])[O-],0.051,O,0.0,2.046
86
+ C1C(OC(=O)O1)F,0.932,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.068,O,0.0,O,0.0,O,0.0,O,0.0,1.41
87
+ COCCOC,0.139,COCC(F)(F)C(F)(F)C(F)(F)C(F)(F)COC,0.692,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.169,O,0.0,O,0.0,O,0.0,2.222
88
+ C1C(OC(=O)O1)F,0.497,COC(=O)OC,0.431,O1CCOCCOCCOCC1,0.0,[Li+].F[P-](F)(F)(F)(F)F,0.072,O,0.0,O,0.0,1.559
89
+ COCCOC,0.231,FC(COC(OCC(F)(F)F)OCC(F)(F)F)(F)F,0.577,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.192,O,0.0,O,0.0,O,0.0,2.301
90
+ CN(C)S(=O)(=O)F,0.921,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.079,O,0.0,O,0.0,O,0.0,O,0.0,1.672
91
+ C1C(OC(=O)O1)F,0.105,C1COC(=O)O1,0.518,O=C(OCC)OCC,0.285,[Li+].F[P-](F)(F)(F)(F)F,0.077,[Rb+].[O-][N+]([O-])=O,0.008,O1CCOCCOCCOCCOCCOCC1,0.008,1.538
92
+ CC1CCC(C)O1,0.893,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.107,O,0.0,O,0.0,O,0.0,O,0.0,1.796
93
+ C1C(OC(=O)O1)F,0.496,COC(=O)OC,0.43,O1CCOCCOCCOCC1,0.002,[Li+].F[P-](F)(F)(F)(F)F,0.072,O,0.0,O,0.0,1.355
94
+ C1COC(=O)O1,0.444,C1COS(=O)O1,0.497,[Li+].[O-]Cl(=O)(=O)=O,0.059,O,0.0,O,0.0,O,0.0,1.523
95
+ COCCOC,0.371,O1CCOC1,0.552,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.031,[Li+].[N+](=O)([O-])[O-],0.046,O,0.0,O,0.0,1.78
96
+ O=S1(=O)CCCC1,0.764,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.236,O,0.0,O,0.0,O,0.0,O,0.0,1.456
97
+ O1C(C)CCC1,0.908,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.092,O,0.0,O,0.0,O,0.0,O,0.0,1.745
98
+ O1CCOC1,0.362,C(C(F)(F)F)OCC(F)(F)F,0.59,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.048,O,0.0,O,0.0,O,0.0,1.967
99
+ COC(=O)OC,0.543,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.457,O,0.0,O,0.0,O,0.0,O,0.0,2.097
100
+ COCCOC,0.73,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.27,O,0.0,O,0.0,O,0.0,O,0.0,1.143
101
+ O1CCOC1,0.552,COCCOC,0.371,[Li+].[N+](=O)([O-])[O-],0.039,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.039,O,0.0,O,0.0,1.523
102
+ COCCOC,0.242,FC(COC(OCC(F)(F)F)OCC(F)(F)F)(F)F,0.604,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.154,O,0.0,O,0.0,O,0.0,2.301
103
+ CCOP(=O)(OCC)OCC,0.6,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.4,O,0.0,O,0.0,O,0.0,O,0.0,2.155
104
+ C1C(OC(=O)O1)F,0.318,CCOC(=O)OC,0.504,COC(=O)OC,0.094,[Li+].F[P-](F)(F)(F)(F)F,0.083,O,0.0,O,0.0,1.301
105
+ COCCOC,0.231,C(C(F)(F)F)OCC(F)(F)F,0.577,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.192,O,0.0,O,0.0,O,0.0,2.222
106
+ C1COC(=O)O1,0.507,COC(=O)OC,0.402,C1=COC(=O)O1,0.022,[Li+].F[P-](F)(F)(F)(F)F,0.069,O,0.0,O,0.0,0.699
107
+ COCCOC,0.231,C(C(F)(F)F)OC(=O)OCC(F)(F)F,0.577,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.192,O,0.0,O,0.0,O,0.0,1.495
108
+ C1COC(=O)O1,0.32,COC(=O)OC,0.253,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.427,O,0.0,O,0.0,O,0.0,2.155
109
+ C1C(OC(=O)O1)F,0.312,O=C1OCCC1,0.599,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.068,[Li+].[N+](=O)([O-])[O-],0.021,O,0.0,O,0.0,1.921
110
+ COC(=O)OC,0.478,FC(F)C(F)(F)COC(F)(F)C(F)F,0.322,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.067,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.134,O,0.0,O,0.0,1.886
111
+ CCOP(=O)(OCC)OCC,0.259,FC(F)C(F)(F)COC(F)(F)C(F)F,0.556,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.185,O,0.0,O,0.0,O,0.0,2.046
112
+ COCCOC,0.677,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.323,O,0.0,O,0.0,O,0.0,O,0.0,1.745
113
+ C1C(OC(=O)O1)F,0.696,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.304,O,0.0,O,0.0,O,0.0,O,0.0,1.633
114
+ C1CCOC1,0.47,O1C(C)CCC1,0.378,[Li+].F[P-](F)(F)(F)(F)F,0.152,O,0.0,O,0.0,O,0.0,2.097
115
+ FC(F)COCCOCC(F)(F)(F),0.838,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.162,O,0.0,O,0.0,O,0.0,O,0.0,2.301
116
+ C1COC(=O)O1,0.496,COC(=O)OC,0.393,C1C(OC(=O)O1)F,0.045,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.066,O,0.0,O,0.0,1.108
117
+ C1C(OC(=O)O1)F,0.62,C(C(F)(F)F)OC(=O)OCC(F)(F)F,0.291,[Li+].F[P-](F)(F)(F)(F)F,0.089,O,0.0,O,0.0,O,0.0,1.62
118
+ CCOCC,0.906,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.094,O,0.0,O,0.0,O,0.0,O,0.0,1.959
119
+ C1COC(=O)O1,0.526,O=C(OCC)OCC,0.392,[Li+].F[P-](F)(F)(F)(F)F,0.083,O,0.0,O,0.0,O,0.0,1.013
120
+ C1COC(=O)O1,0.05,CCOC(=O)OC,0.237,C(C(F)(F)F)OCC(F)(F)F,0.575,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.123,[Li+].[B-]1(OC(=O)C(=O)O1)(F)F,0.015,O,0.0,1.824
121
+ O=S1(=O)CCCC1,0.429,FC(F)C(F)(F)COC(F)(F)C(F)F,0.429,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.143,O,0.0,O,0.0,O,0.0,1.921
data/lce/train_data.csv ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ smiles1,conc1,smiles2,conc2,smiles3,conc3,smiles4,conc4,smiles5,conc5,smiles6,conc6,LCE
2
+ CC1COC(=O)O1,0.875,C1C(OC(=O)O1)F,0.051,[Li+].[O-]Cl(=O)(=O)=O,0.074,O,0,O,0,O,0,0.699
3
+ C1COC(=O)O1,0.507,COC(=O)OC,0.402,C1=COC(=O)O1,0.022,[Li+].F[P-](F)(F)(F)(F)F,0.069,O,0,O,0,0.699
4
+ FC(F)COCCOCC(F)(F),0.845,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.155,O,0,O,0,O,0,O,0,2.301
5
+ FC(F)COCCOCC(F)(F)(F),0.838,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.162,O,0,O,0,O,0,O,0,2.301
6
+ CN(C)C(=O)C(F)(F)F,0.362,C1C(OC(=O)O1)F,0.556,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.081,O,0,O,0,O,0,2.155
7
+ COCCOC,0.231,FC1CCCCC1,0.577,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.192,O,0,O,0,O,0,2.155
8
+ CCOP(=O)(OCC)OCC,0.6,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.4,O,0,O,0,O,0,O,0,2.155
9
+ O1CCOC1,0.362,C(C(F)(F)F)OCC(F)(F)F,0.59,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.048,O,0,O,0,O,0,1.967
10
+ COCC(F)(F)C(F)(F)COC,0.864,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.136,O,0,O,0,O,0,O,0,1.991
11
+ C1C(OC(=O)O1)F,0.662,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.338,O,0,O,0,O,0,O,0,1.646
12
+ COCCOC,0.358,O1CCOC1,0.532,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.074,[Li+].[N+](=O)([O-])[O-],0.035,O,0,O,0,1.658
13
+ CN(C)S(=O)(=O)F,0.921,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.079,O,0,O,0,O,0,O,0,1.672
14
+ C1C(OC(=O)O1)F,0.106,C1COC(=O)O1,0.522,O=C(OCC)OCC,0.287,[Li+].F[P-](F)(F)(F)(F)F,0.077,[Rb+].[O-][N+]([O-])=O,0.004,O1CCOCCOCCOCCOCCOCC1,0.004,1.252
15
+ C1COC(=O)O1,0.32,COC(=O)OC,0.253,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.427,O,0,O,0,O,0,2.155
16
+ COCCOC,0.277,FC(F)C(F)(F)COC(F)(F)C(F)F,0.555,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.168,O,0,O,0,O,0,2.155
17
+ COC(=O)OC,0.161,FC(F)C(F)(F)COC(F)(F)C(F)F,0.355,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.484,O,0,O,0,O,0,2.155
18
+ FC(F)(F)COCCOCC,0.838,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.162,O,0,O,0,O,0,O,0,2.155
19
+ FC(F)(F)COCCOCC(F)(F)(F),0.838,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.162,O,0,O,0,O,0,O,0,2.155
20
+ CCOCC,0.64,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.36,O,0,O,0,O,0,O,0,2.208
21
+ C1C(OC(=O)O1)F,0.144,COC(=O)OCCF,0.173,C(C(F)(F)F)OC(C(F)F)(F)F,0.548,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.135,O,0,O,0,2.222
22
+ CC#N,0.882,FC,0.065,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.054,O,0,O,0,O,0,2.222
23
+ C1CCOC1,0.942,FC,0.029,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.029,O,0,O,0,O,0,2.222
24
+ COCCOC,0.139,COCC(F)(F)C(F)(F)C(F)(F)C(F)(F)COC,0.692,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.169,O,0,O,0,O,0,2.222
25
+ COCCOC,0.231,C(C(F)(F)F)OCC(F)(F)F,0.577,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.192,O,0,O,0,O,0,2.222
26
+ COCCOC,0.507,COC(C(F)(F)F)C(F)(F)F,0.399,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.095,O,0,O,0,O,0,2.268
27
+ CCOCC,0.313,C(C(F)(F)F)OCC(F)(F)F,0.51,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.177,O,0,O,0,O,0,2.301
28
+ COC(=O)OC,0.29,C(C(F)(F)F)OCC(F)(F)F,0.589,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.121,O,0,O,0,O,0,2.301
29
+ COCCOC,0.242,FC(COC(OCC(F)(F)F)OCC(F)(F)F)(F)F,0.604,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.154,O,0,O,0,O,0,2.301
30
+ O1C(C)CCC1,0.331,FC(F)C(F)(F)COC(F)(F)C(F)F,0.498,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.171,O,0,O,0,O,0,2.301
31
+ COCCOC,0.2,FC(F)C(F)(F)COC(F)(F)C(F)F,0.6,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.2,O,0,O,0,O,0,2.301
32
+ COCCOC,0.231,FC(COC(OCC(F)(F)F)OCC(F)(F)F)(F)F,0.577,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.192,O,0,O,0,O,0,2.301
33
+ O=S1(=O)CCCC1,0.359,C(C(F)(F)F)OC(C(F)F)(F)F,0.504,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.133,[Li+].[N+](=O)([O-])[O-],0.004,O,0,O,0,2
34
+ CCOCC,0.856,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.144,O,0,O,0,O,0,O,0,2
35
+ CCOCC,0.877,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.123,O,0,O,0,O,0,O,0,2.018
36
+ CCOCC,0.707,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.293,O,0,O,0,O,0,O,0,2.046
37
+ C1COC(=O)O1,0.308,O=C(OCC)OCC(F)(F)F,0.349,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.343,O,0,O,0,O,0,2.046
38
+ O1CCOC1,0.453,COCCOC,0.305,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.127,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.063,[Li+].[N+](=O)([O-])[O-],0.051,O,0,2.046
39
+ CCOP(=O)(OCC)OCC,0.259,FC(F)C(F)(F)COC(F)(F)C(F)F,0.556,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.185,O,0,O,0,O,0,2.046
40
+ COCCOC,0.257,C(C(F)(F)F)OCC(F)(F)F,0.508,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.235,O,0,O,0,O,0,2.051
41
+ COC(=O)OC,0.299,C(C(F)(F)F)OCC(F)(F)F,0.598,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.103,O,0,O,0,O,0,2.056
42
+ CCOP(=O)(OCC)OCC,0.214,C(C(F)(F)F)OCC(F)(F)F,0.642,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.144,O,0,O,0,O,0,2.097
43
+ COC(=O)OC,0.684,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.316,O,0,O,0,O,0,O,0,2.097
44
+ C1CCOC1,0.47,O1C(C)CCC1,0.378,[Li+].F[P-](F)(F)(F)(F)F,0.152,O,0,O,0,O,0,2.097
45
+ C1C(OC(=O)O1)F,0.264,COC(=O)OCCF,0.479,C(C(F)(F)F)OC(C(F)F)(F)F,0.155,[Li+].F[P-](F)(F)(F)(F)F,0.103,O,0,O,0,2.097
46
+ CCOP(=O)(OCC)OCC,0.5,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.5,O,0,O,0,O,0,O,0,2.097
47
+ COC(=O)OC,0.543,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.457,O,0,O,0,O,0,O,0,2.097
48
+ COCCOC,0.682,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.318,O,0,O,0,O,0,O,0,2.108
49
+ COCCOC,0.231,FC(F)C(F)(F)COC(F)(F)C(F)F,0.577,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.192,O,0,O,0,O,0,2.155
50
+ CCOP(=O)(OCC)OCC,0.728,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.272,O,0,O,0,O,0,O,0,2
51
+ COCCOC,0.583,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.278,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.139,O,0,O,0,O,0,1.678
52
+ C1COC(=O)O1,0.305,COC(=O)OC,0.242,COCCOCCOCCOCCOC,0.392,[Li+].F[P-](F)(F)(F)(F)F,0.041,[Li+].[N+](=O)([O-])[O-],0.02,O,0,1.678
53
+ C1C(OC(=O)O1)F,0.318,CCOC(=O)OC,0.504,COC(=O)OC,0.094,B(O[Si](C)(C)C)(O[Si](C)(C)C)O[Si](C)(C),0.083,[Li+].F[P-](F)(F)(F)(F)F,0.001,O,0,1.678
54
+ O=S1(=O)CCCC1,0.714,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.286,O,0,O,0,O,0,O,0,1.699
55
+ C1C(OC(=O)O1)F,0.149,COC(=O)OCCF,0.178,C(C(F)(F)F)OC(C(F)F)(F)F,0.564,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.108,O,0,O,0,1.735
56
+ O=S1(=O)CCCC1,0.75,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.25,O,0,O,0,O,0,O,0,1.745
57
+ COC(=O)OC,0.29,C(C(F)(F)F)OCC(F)(F)F,0.589,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.121,O,0,O,0,O,0,1.745
58
+ COCCOC,0.677,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.323,O,0,O,0,O,0,O,0,1.745
59
+ O1C(C)CCC1,0.908,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.092,O,0,O,0,O,0,O,0,1.745
60
+ COC(=O)OC,0.6,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.4,O,0,O,0,O,0,O,0,1.77
61
+ COCCOC,0.371,O1CCOC1,0.552,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.031,[Li+].[N+](=O)([O-])[O-],0.046,O,0,O,0,1.78
62
+ CC1CCC(C)O1,0.893,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.107,O,0,O,0,O,0,O,0,1.796
63
+ C1COC(=O)O1,0.05,CCOC(=O)OC,0.237,C(C(F)(F)F)OCC(F)(F)F,0.575,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.123,[Li+].[B-]1(OC(=O)C(=O)O1)(F)F,0.015,O,0,1.824
64
+ O=S1(=O)CCCC1,0.758,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.235,[Li+].[N+](=O)([O-])[O-],0.007,O,0,O,0,O,0,1.824
65
+ O1CCOC1,0.463,COCCOC,0.312,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.194,[Li+].[N+](=O)([O-])[O-],0.03,O,0,O,0,1.824
66
+ O1CCOC1,0.539,COCCOC,0.363,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.075,[Li+].[N+](=O)([O-])[O-],0.023,O,0,O,0,1.824
67
+ COC(=O)OC,0.375,FC(F)C(F)(F)COC(F)(F)C(F)F,0.375,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.25,O,0,O,0,O,0,1.854
68
+ O1CCOC1,0.478,COCCOC,0.322,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.134,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.067,O,0,O,0,1.854
69
+ CC#N,0.621,C1=COC(=O)O1,0.056,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.323,O,0,O,0,O,0,1.854
70
+ COC(=O)OC,0.478,FC(F)C(F)(F)COC(F)(F)C(F)F,0.322,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.067,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.134,O,0,O,0,1.886
71
+ COC(=O)OC,0.664,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.336,O,0,O,0,O,0,O,0,1.886
72
+ O1CCOC1,0.478,COCCOC,0.322,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.2,O,0,O,0,O,0,1.903
73
+ O=S1(=O)CCCC1,0.429,FC(F)C(F)(F)COC(F)(F)C(F)F,0.429,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.143,O,0,O,0,O,0,1.921
74
+ C1C(OC(=O)O1)F,0.312,O=C1OCCC1,0.599,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.068,[Li+].[N+](=O)([O-])[O-],0.021,O,0,O,0,1.921
75
+ CC1COC(=O)O1,0.595,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.405,O,0,O,0,O,0,O,0,1.921
76
+ O1CCOC1,0.371,COCCOC,0.552,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.077,O,0,O,0,O,0,1.959
77
+ CCOCC,0.906,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.094,O,0,O,0,O,0,O,0,1.959
78
+ C1CCOC1,0.925,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.075,O,0,O,0,O,0,O,0,1.377
79
+ C1COC(=O)O1,0.425,O=C(OCC)OCC,0.234,[Li+].F[P-](F)(F)(F)(F)F,0.34,O,0,O,0,O,0,1.398
80
+ C1C(OC(=O)O1)F,0.932,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.068,O,0,O,0,O,0,O,0,1.41
81
+ COCCOC,0.906,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.047,[Li+].FP(F)(=O)([O-]),0.047,O,0,O,0,O,0,1.444
82
+ O=S1(=O)CCCC1,0.764,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.236,O,0,O,0,O,0,O,0,1.456
83
+ O=C(OCC)C,0.105,ClCCl,0.64,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.255,O,0,O,0,O,0,1.456
84
+ C1C(OC(=O)O1)F,0.496,COC(=O)OC,0.43,O1CCOCCOCCOCC1,0.001,[Li+].F[P-](F)(F)(F)(F)F,0.072,O,0,O,0,1.488
85
+ C1C(OC(=O)O1)F,0.873,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.127,O,0,O,0,O,0,O,0,1.489
86
+ COCCOC,0.231,C(C(F)(F)F)OC(=O)OCC(F)(F)F,0.577,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.192,O,0,O,0,O,0,1.495
87
+ C1C(OC(=O)O1)F,0.495,COC(=O)OC,0.429,O1CCOCCOCCOCC1,0.003,[Li+].F[P-](F)(F)(F)(F)F,0.072,O,0,O,0,1.498
88
+ C1C(OC(=O)O1)F,0.481,O=C(OCC)OCC,0.432,[Li+].F[P-](F)(F)(F)(F)F,0.087,O,0,O,0,O,0,1.523
89
+ O1CCOC1,0.322,COCCOC,0.478,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)C(F)(F)C(F)(F)C(F)(F)F,0.2,O,0,O,0,O,0,1.523
90
+ O1CCOC1,0.552,COCCOC,0.371,[Li+].[N+](=O)([O-])[O-],0.039,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.039,O,0,O,0,1.523
91
+ C1COC(=O)O1,0.444,C1COS(=O)O1,0.497,[Li+].[O-]Cl(=O)(=O)=O,0.059,O,0,O,0,O,0,1.523
92
+ C1C(OC(=O)O1)F,0.105,C1COC(=O)O1,0.518,O=C(OCC)OCC,0.285,[Li+].F[P-](F)(F)(F)(F)F,0.077,[Rb+].[O-][N+]([O-])=O,0.008,O1CCOCCOCCOCCOCCOCC1,0.008,1.538
93
+ C1C(OC(=O)O1)F,0.82,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.18,O,0,O,0,O,0,O,0,1.544
94
+ C1C(OC(=O)O1)F,0.497,COC(=O)OC,0.431,O1CCOCCOCCOCC1,0,[Li+].F[P-](F)(F)(F)(F)F,0.072,O,0,O,0,1.559
95
+ COCCOC,0.906,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.094,O,0,O,0,O,0,O,0,1.561
96
+ CCOP(=O)(OCC)OCC,0.648,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.352,O,0,O,0,O,0,O,0,1.569
97
+ O=S1(=O)CCCC1,0.25,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.75,O,0,O,0,O,0,O,0,1.569
98
+ C1C(OC(=O)O1)F,0.774,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.226,O,0,O,0,O,0,O,0,1.587
99
+ C1C(OC(=O)O1)F,0.413,O=C(OCC)OCC,0.497,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.09,O,0,O,0,O,0,1.59
100
+ C1COC(=O)O1,0.425,O=C(OCC)OCC(F)(F)F,0.481,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.094,O,0,O,0,O,0,1.602
101
+ CC1COC(=O)O1,0.702,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.298,O,0,O,0,O,0,O,0,1.602
102
+ O1CCOCC1,0.912,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.088,O,0,O,0,O,0,O,0,1.602
103
+ C1C(OC(=O)O1)F,0.62,C(C(F)(F)F)OC(=O)OCC(F)(F)F,0.291,[Li+].F[P-](F)(F)(F)(F)F,0.089,O,0,O,0,O,0,1.62
104
+ COC(=O)CC(F)(F)F,0.768,C1C(OC(=O)O1)F,0.134,[Li+].F[P-](F)(F)(F)(F)F,0.098,O,0,O,0,O,0,1.62
105
+ C1C(OC(=O)O1)F,0.733,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.267,O,0,O,0,O,0,O,0,1.629
106
+ C1C(OC(=O)O1)F,0.696,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.304,O,0,O,0,O,0,O,0,1.633
107
+ COC(C)C(C)OC,0.879,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.121,O,0,O,0,O,0,O,0,1.638
108
+ C1COC(=O)O1,0.197,COC(=O)OC,0.156,COCCOCCOCCOCCOC,0.59,[Li+].F[P-](F)(F)(F)(F)F,0.026,[Li+].[N+](=O)([O-])[O-],0.031,O,0,1.638
109
+ C1COC(=O)O1,0.338,COC(=O)OC,0.625,[Li+].[O-]P(=O)(F)F,0.008,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.03,O,0,O,0,1.26
110
+ COCCOC,0.707,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.147,[Li+].[B-]1(OC(=O)C(=O)O1)(F)F,0.147,O,0,O,0,O,0,1.268
111
+ C1COC(=O)O1,0.496,COC(=O)OC,0.43,O1CCOCCOCCOCC1,0.002,[Li+].F[P-](F)(F)(F)(F)F,0.072,O,0,O,0,1.276
112
+ COCCOC,0.763,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.174,[Li+].[O-]P(=O)(F)F,0.063,O,0,O,0,O,0,1.292
113
+ C1COC(=O)O1,0.563,O=C(OCC)OCC,0.31,C1C(OC(=O)O1)F,0.052,[Li+].F[P-](F)(F)(F)(F)F,0.075,O,0,O,0,1.301
114
+ COCCOCCOCC(F)(F)OC(F)(F)OC(F)(F)COCCOCCOC,0.708,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.292,O,0,O,0,O,0,O,0,1.301
115
+ C1COC(=O)O1,0.331,O=C(OCC)OCC,0.577,[Li+].[B-]1(OC(=O)C(=O)O1)(F)F,0.092,O,0,O,0,O,0,1.301
116
+ C1C(OC(=O)O1)F,0.318,CCOC(=O)OC,0.504,COC(=O)OC,0.094,[Li+].F[P-](F)(F)(F)(F)F,0.083,O,0,O,0,1.301
117
+ C1COC(=O)O1,0.519,O=C(OCC)OCC,0.387,[Li+].F[P-](F)(F)(F)(F)F,0.082,[Li+].[O-]P(=O)(F)F,0.012,O,0,O,0,1.319
118
+ C1C(OC(=O)O1)F,0.496,COC(=O)OC,0.43,O1CCOCCOCCOCC1,0.002,[Li+].F[P-](F)(F)(F)(F)F,0.072,O,0,O,0,1.333
119
+ COCCOC,0.259,B(OCC(F)(F)F)(OCC(F)(F)F)OCC(F)(F)F,0.556,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.185,O,0,O,0,O,0,1.337
120
+ C1C(OC(=O)O1)F,0.496,COC(=O)OC,0.43,O1CCOCCOCCOCC1,0.002,[Li+].F[P-](F)(F)(F)(F)F,0.072,O,0,O,0,1.355
121
+ COCCOC,0.763,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.237,O,0,O,0,O,0,O,0,1.367
122
+ C1C(OC(=O)O1)F,0.497,COC(=O)OC,0.43,O1CCOCCOCCOCC1,0.001,[Li+].F[P-](F)(F)(F)(F)F,0.072,O,0,O,0,1.373
123
+ C1COC(=O)O1,0.507,COC(=O)OC,0.402,C1=COC(=O)O1,0.022,[Li+].F[B-](F)(F)F,0.069,O,0,O,0,0.699
124
+ CC1COC(=O)O1,0.922,[Li+].F[B-](F)(F)OC(C(F)(F)(F))(C(F)(F)(F))C(F)(F)(F),0.078,O,0,O,0,O,0,O,0,0.712
125
+ C1COC(=O)O1,0.3,CCOC(=O)OC,0.593,C1=COC(=O)O1,0.026,[Li+].F[P-](F)(F)(F)(F)F,0.081,O,0,O,0,0.745
126
+ C1COC(=O)O1,0.507,COC(=O)OC,0.402,C1=COC(=O)O1,0.022,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.069,O,0,O,0,0.745
127
+ C1COC(=O)O1,0.326,COC(=O)OC,0.602,[Li+].F[P-](F)(F)(F)(F)F,0.072,O,0,O,0,O,0,0.777
128
+ C1COC(=O)O1,0.362,O=C(OCC)OCC,0.548,[Li+].F[P-](F)(F)(F)(F)F,0.09,O,0,O,0,O,0,0.788
129
+ CC1COC(=O)O1,0.887,[Li+].F[As-](F)(F)(F)(F)F,0.113,O,0,O,0,O,0,O,0,0.824
130
+ C1COC(=O)O1,0.507,COC(=O)OC,0.402,C1=COC(=O)O1,0.022,[Li+].C(C(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(C(F)(F)F)(F)F)(F)(F)F,0.069,O,0,O,0,0.854
131
+ C1COC(=O)O1,0.359,COC(=O)OC,0.569,[Li+].F[P-](F)(F)(F)(F)F,0.072,O,0,O,0,O,0,0.854
132
+ C1COC(=O)O1,0.331,O=C(OCC)OCC,0.577,[Li+].F[P-](F)(F)(F)(F)F,0.092,O,0,O,0,O,0,0.886
133
+ C1COC(=O)O1,0.594,O=C(OCC)OCC,0.327,[Li+].F[P-](F)(F)(F)(F)F,0.079,O,0,O,0,O,0,0.921
134
+ C1COC(=O)O1,0.5,CCOC(=O)OC,0.423,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.046,[Li+].O=C1O[B-]2(OC1=O)OC(=O)C(=O)O2,0.031,O,0,O,0,0.924
135
+ C1COC(=O)O1,0.526,O=C(OCC)OCC,0.392,[Li+].F[P-](F)(F)(F)(F)F,0.083,O,0,O,0,O,0,1.013
136
+ C1COC(=O)O1,0.356,COC(=O)OC,0.566,FC(F)(F)COB(OCC(F)(F)F)OCC(F)(F)F,0.007,[Li+].F[P-](F)(F)(F)(F)F,0.072,O,0,O,0,1.046
137
+ C1COC(=O)O1,0.682,CCOC(=O)OC,0.247,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.043,[Li+].O=C1O[B-]2(OC1=O)OC(=O)C(=O)O2,0.028,O,0,O,0,1.076
138
+ C1COC(=O)O1,0.854,CCOC(=O)OC,0.08,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.039,[Li+].O=C1O[B-]2(OC1=O)OC(=O)C(=O)O2,0.026,O,0,O,0,1.081
139
+ C1C(OC(=O)O1)F,0.497,COC(=O)OC,0.431,[Li+].F[P-](F)(F)(F)(F)F,0.072,O,0,O,0,O,0,1.085
140
+ C1C(OC(=O)O1)F,0.107,C1COC(=O)O1,0.526,O=C(OCC)OCC,0.289,[Li+].F[P-](F)(F)(F)(F)F,0.078,O,0,O,0,1.108
141
+ C1COC(=O)O1,0.496,COC(=O)OC,0.393,C1C(OC(=O)O1)F,0.045,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.066,O,0,O,0,1.108
142
+ COCCOC,0.73,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.27,O,0,O,0,O,0,O,0,1.143
143
+ C1COC(=O)O1,0.327,O=C(OCC)OCC,0.594,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.079,O,0,O,0,O,0,1.155
144
+ C1COC(=O)O1,0.338,COC(=O)OC,0.625,[Li+].[O-]P(=O)(F)F,0.008,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.03,O,0,O,0,1.194
145
+ COCCOC,0.731,[Li+].[O-]P(=O)(F)F,0.064,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.205,O,0,O,0,O,0,1.215
146
+ COCCOCCOCCOCCOC,0.819,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.181,O,0,O,0,O,0,O,0,1.222
147
+ C1C(OC(=O)O1)F,0.497,COC(=O)OC,0.43,O1CCOCCOCCOCC1,0,[Li+].F[P-](F)(F)(F)(F)F,0.072,O,0,O,0,1.225
148
+ COCCOC,0.706,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.008,[Li+].[O-]P(=O)(F)F,0.286,O,0,O,0,O,0,1.244
models/.gitattributes ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ *.csv filter=lfs diff=lfs merge=lfs -text
2
+ *.png filter=lfs diff=lfs merge=lfs -text
3
+ *.pdf filter=lfs diff=lfs merge=lfs -text
models/fm4m.py CHANGED
@@ -25,9 +25,17 @@ from sklearn.preprocessing import MinMaxScaler
25
  import torch
26
  from transformers import AutoTokenizer, AutoModel
27
 
28
- from .selfies_model.load import SELFIES as bart
29
- from .mhg_model import load as mhg
30
- from .smi_ted.smi_ted_light.load import load_smi_ted
 
 
 
 
 
 
 
 
31
 
32
  datasets = {}
33
  models = {}
@@ -48,7 +56,7 @@ def avail_models_data():
48
 
49
 
50
  models = [{"Name": "bart","Model Name": "SELFIES-TED","Description": "BART model for string based SELFIES modality", "Timestamp": "2024-06-21 12:32:20"},
51
- {"Name": "mol-xl","Model Name": "Molformer", "Description": "MolFormer model for string based SMILES modality", "Timestamp": "2024-06-21 12:35:56"},
52
  {"Name": "mhg", "Model Name": "MHG-GED","Description": "Molecular hypergraph model", "Timestamp": "2024-07-10 00:09:42"},
53
  {"Name": "smi-ted", "Model Name": "SMI-TED","Description": "SMILES based encoder decoder model", "Timestamp": "2024-07-10 00:09:42"}]
54
 
@@ -58,8 +66,10 @@ def avail_models(raw=False):
58
 
59
  models = [{"Name": "smi-ted", "Model Name": "SMI-TED","Description": "SMILES based encoder decoder model"},
60
  {"Name": "bart","Model Name": "SELFIES-TED","Description": "BART model for string based SELFIES modality"},
61
- {"Name": "mol-xl","Model Name": "Molformer", "Description": "MolFormer model for string based SMILES modality"},
62
  {"Name": "mhg", "Model Name": "MHG-GED","Description": "Molecular hypergraph model"},
 
 
63
  ]
64
 
65
 
@@ -70,12 +80,22 @@ def avail_models(raw=False):
70
 
71
  return models
72
 
73
- def avail_downstream_models():
74
  global downstream_models
75
 
76
- with open("downstream_models.json", "r") as outfile:
77
- downstream_models = json.load(outfile)
78
- return downstream_models
 
 
 
 
 
 
 
 
 
 
79
 
80
  def avail_datasets():
81
  global datasets
@@ -178,13 +198,15 @@ def update_downstream_model_list(list_model):
178
 
179
  avail_models_data()
180
 
 
 
181
  def get_representation(train_data,test_data,model_type, return_tensor=True):
182
  alias = {"MHG-GED": "mhg", "SELFIES-TED": "bart", "MolFormer": "mol-xl", "Molformer": "mol-xl", "SMI-TED": "smi-ted"}
183
  if model_type in alias.keys():
184
  model_type = alias[model_type]
185
 
186
  if model_type == "mhg":
187
- model = mhg.load("models/mhg_model/pickles/mhggnn_pretrained_model_0724_2023.pickle")
188
  with torch.no_grad():
189
  train_emb = model.encode(train_data)
190
  x_batch = torch.stack(train_emb)
@@ -196,7 +218,6 @@ def get_representation(train_data,test_data,model_type, return_tensor=True):
196
  x_batch_test = pd.DataFrame(x_batch_test)
197
 
198
 
199
-
200
  elif model_type == "bart":
201
  model = bart()
202
  model.load()
@@ -204,7 +225,7 @@ def get_representation(train_data,test_data,model_type, return_tensor=True):
204
  x_batch_test = model.encode(test_data, return_tensor=return_tensor)
205
 
206
  elif model_type == "smi-ted":
207
- model = load_smi_ted(folder='./models/smi_ted/smi_ted_light', ckpt_filename='smi-ted-Light_40.pt')
208
  with torch.no_grad():
209
  x_batch = model.encode(train_data, return_torch=return_tensor)
210
  x_batch_test = model.encode(test_data, return_torch=return_tensor)
@@ -237,35 +258,78 @@ def get_representation(train_data,test_data,model_type, return_tensor=True):
237
  if not return_tensor:
238
  x_batch = pd.DataFrame(x_batch)
239
  x_batch_test = pd.DataFrame(x_batch_test)
240
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
241
 
242
  return x_batch, x_batch_test
243
 
244
- def single_modal(model,dataset, downstream_model,params):
245
  print(model)
246
- alias = {"MHG-GED":"mhg", "SELFIES-TED": "bart", "MolFormer":"mol-xl", "SMI-TED": "smi-ted"}
247
  data = avail_models(raw=True)
248
  df = pd.DataFrame(data)
249
- print(list(df["Name"].values))
250
- if alias[model] in list(df["Name"].values):
251
- if model in alias.keys():
 
 
252
  model_type = alias[model]
253
- else:
254
- model_type = model
255
  else:
256
  print("Model not available")
257
  return
 
258
 
259
  data = avail_datasets()
260
  df = pd.DataFrame(data)
261
- print(list(df["Dataset"].values))
262
 
263
  if dataset in list(df["Dataset"].values):
264
  task = dataset
265
- with open(f"./representation/{task}_{model_type}.pkl", "rb") as f1:
266
  x_batch, y_batch, x_batch_test, y_batch_test = pickle.load(f1)
267
  print(f" Representation loaded successfully")
268
- else:
 
269
 
270
  print("Custom Dataset")
271
  #return
@@ -283,14 +347,40 @@ def single_modal(model,dataset, downstream_model,params):
283
 
284
  print(f" Representation loaded successfully")
285
 
 
286
 
287
-
288
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
289
 
290
  print(f" Calculating ROC AUC Score ...")
291
 
292
  if downstream_model == "XGBClassifier":
293
- xgb_predict_concat = XGBClassifier(**params) # n_estimators=5000, learning_rate=0.01, max_depth=10
 
 
 
294
  xgb_predict_concat.fit(x_batch, y_batch)
295
 
296
  y_prob = xgb_predict_concat.predict_proba(x_batch_test)[:, 1]
@@ -300,21 +390,26 @@ def single_modal(model,dataset, downstream_model,params):
300
  print(f"ROC-AUC Score: {roc_auc:.4f}")
301
 
302
  try:
303
- with open(f"./plot_emb/{task}_{model_type}.pkl", "rb") as f1:
304
  class_0,class_1 = pickle.load(f1)
305
  except:
306
  print("Generating latent plots")
307
  reducer = umap.UMAP(metric='euclidean', n_neighbors=10, n_components=2, low_memory=True, min_dist=0.1,
308
  verbose=False)
309
  n_samples = np.minimum(1000, len(x_batch))
310
- features_umap = reducer.fit_transform(x_batch[:n_samples])
311
  try:x = y_batch.values[:n_samples]
312
  except: x = y_batch[:n_samples]
313
  index_0 = [index for index in range(len(x)) if x[index] == 0]
314
  index_1 = [index for index in range(len(x)) if x[index] == 1]
315
 
316
- class_0 = features_umap[index_0]
317
- class_1 = features_umap[index_1]
 
 
 
 
 
318
  print("Generating latent plots : Done")
319
 
320
  #vizualize(roc_auc,fpr, tpr, x_batch, y_batch )
@@ -334,20 +429,29 @@ def single_modal(model,dataset, downstream_model,params):
334
  print(f"ROC-AUC Score: {roc_auc:.4f}")
335
 
336
  try:
337
- with open(f"./plot_emb/{task}_{model_type}.pkl", "rb") as f1:
338
  class_0,class_1 = pickle.load(f1)
339
  except:
340
  print("Generating latent plots")
341
  reducer = umap.UMAP(metric='euclidean', n_neighbors= 10, n_components=2, low_memory=True, min_dist=0.1, verbose=False)
342
  n_samples = np.minimum(1000,len(x_batch))
343
- features_umap = reducer.fit_transform(x_batch[:n_samples])
344
- try:x = y_batch.values[:n_samples]
345
- except:x = y_batch[:n_samples]
346
- index_0 = [index for index in range(len(x)) if x[index] == 0]
347
- index_1 = [index for index in range(len(x)) if x[index] == 1]
348
 
349
- class_0 = features_umap[index_0]
350
- class_1 = features_umap[index_1]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
351
  print("Generating latent plots : Done")
352
 
353
  #vizualize(roc_auc,fpr, tpr, x_batch, y_batch )
@@ -355,16 +459,19 @@ def single_modal(model,dataset, downstream_model,params):
355
  result = f"ROC-AUC Score: {roc_auc:.4f}"
356
 
357
  return result, roc_auc,fpr, tpr, class_0, class_1
358
-
359
  elif downstream_model == "SVR":
360
- regressor = SVR(**params)
 
 
 
361
  model = TransformedTargetRegressor(regressor= regressor,
362
  transformer = MinMaxScaler(feature_range=(-1, 1))
363
  ).fit(x_batch,y_batch)
364
-
365
  y_prob = model.predict(x_batch_test)
366
  RMSE_score = np.sqrt(mean_squared_error(y_batch_test, y_prob))
367
-
368
  print(f"RMSE Score: {RMSE_score:.4f}")
369
  result = f"RMSE Score: {RMSE_score:.4f}"
370
 
@@ -372,20 +479,28 @@ def single_modal(model,dataset, downstream_model,params):
372
  reducer = umap.UMAP(metric='euclidean', n_neighbors=10, n_components=2, low_memory=True, min_dist=0.1,
373
  verbose=False)
374
  n_samples = np.minimum(1000, len(x_batch))
375
- features_umap = reducer.fit_transform(x_batch[:n_samples])
376
- try:x = y_batch.values[:n_samples]
377
- except:x = y_batch[:n_samples]
378
  #index_0 = [index for index in range(len(x)) if x[index] == 0]
379
  #index_1 = [index for index in range(len(x)) if x[index] == 1]
380
 
381
- class_0 = features_umap#[index_0]
382
- class_1 = features_umap#[index_1]
 
 
 
 
 
383
  print("Generating latent plots : Done")
384
-
385
  return result, RMSE_score,y_batch_test, y_prob, class_0, class_1
386
 
387
  elif downstream_model == "Kernel Ridge":
388
- regressor = KernelRidge(**params)
 
 
 
389
  model = TransformedTargetRegressor(regressor=regressor,
390
  transformer=MinMaxScaler(feature_range=(-1, 1))
391
  ).fit(x_batch, y_batch)
@@ -401,8 +516,8 @@ def single_modal(model,dataset, downstream_model,params):
401
  verbose=False)
402
  n_samples = np.minimum(1000, len(x_batch))
403
  features_umap = reducer.fit_transform(x_batch[:n_samples])
404
- try:x = y_batch.values[:n_samples]
405
- except:x = y_batch[:n_samples]
406
  # index_0 = [index for index in range(len(x)) if x[index] == 0]
407
  # index_1 = [index for index in range(len(x)) if x[index] == 1]
408
 
@@ -414,7 +529,10 @@ def single_modal(model,dataset, downstream_model,params):
414
 
415
 
416
  elif downstream_model == "Linear Regression":
417
- regressor = LinearRegression(**params)
 
 
 
418
  model = TransformedTargetRegressor(regressor=regressor,
419
  transformer=MinMaxScaler(feature_range=(-1, 1))
420
  ).fit(x_batch, y_batch)
@@ -431,7 +549,7 @@ def single_modal(model,dataset, downstream_model,params):
431
  n_samples = np.minimum(1000, len(x_batch))
432
  features_umap = reducer.fit_transform(x_batch[:n_samples])
433
  try:x = y_batch.values[:n_samples]
434
- except:x = y_batch[:n_samples]
435
  # index_0 = [index for index in range(len(x)) if x[index] == 0]
436
  # index_1 = [index for index in range(len(x)) if x[index] == 1]
437
 
@@ -460,7 +578,7 @@ def single_modal(model,dataset, downstream_model,params):
460
  n_samples = np.minimum(1000, len(x_batch))
461
  features_umap = reducer.fit_transform(x_batch[:n_samples])
462
  try:x = y_batch.values[:n_samples]
463
- except:x = y_batch[:n_samples]
464
  # index_0 = [index for index in range(len(x)) if x[index] == 0]
465
  # index_1 = [index for index in range(len(x)) if x[index] == 1]
466
 
@@ -469,10 +587,10 @@ def single_modal(model,dataset, downstream_model,params):
469
  print("Generating latent plots : Done")
470
 
471
  return result, RMSE_score, y_batch_test, y_prob, class_0, class_1
 
472
 
473
-
474
- def multi_modal(model_list,dataset, downstream_model,params):
475
- print(model_list)
476
  data = avail_datasets()
477
  df = pd.DataFrame(data)
478
  list(df["Dataset"].values)
@@ -480,7 +598,7 @@ def multi_modal(model_list,dataset, downstream_model,params):
480
  if dataset in list(df["Dataset"].values):
481
  task = dataset
482
  predefined = True
483
- else:
484
  predefined = False
485
  components = dataset.split(",")
486
  train_data = pd.read_csv(components[0])[components[2]]
@@ -490,13 +608,18 @@ def multi_modal(model_list,dataset, downstream_model,params):
490
  y_batch_test = pd.read_csv(components[1])[components[3]]
491
 
492
  print("Custom Dataset loaded")
493
-
 
 
 
 
 
494
 
495
  data = avail_models(raw=True)
496
  df = pd.DataFrame(data)
497
  list(df["Name"].values)
498
 
499
- alias = {"MHG-GED":"mhg", "SELFIES-TED": "bart", "MolFormer":"mol-xl", "SMI-TED":"smi-ted"}
500
  #if set(model_list).issubset(list(df["Name"].values)):
501
  if set(model_list).issubset(list(alias.keys())):
502
  for i, model in enumerate(model_list):
@@ -507,7 +630,7 @@ def multi_modal(model_list,dataset, downstream_model,params):
507
 
508
  if i == 0:
509
  if predefined:
510
- with open(f"./representation/{task}_{model_type}.pkl", "rb") as f1:
511
  x_batch, y_batch, x_batch_test, y_batch_test = pickle.load(f1)
512
  print(f" Loaded representation/{task}_{model_type}.pkl")
513
  else:
@@ -517,7 +640,7 @@ def multi_modal(model_list,dataset, downstream_model,params):
517
 
518
  else:
519
  if predefined:
520
- with open(f"./representation/{task}_{model_type}.pkl", "rb") as f1:
521
  x_batch_1, y_batch_1, x_batch_test_1, y_batch_test_1 = pickle.load(f1)
522
  print(f" Loaded representation/{task}_{model_type}.pkl")
523
  else:
@@ -528,7 +651,6 @@ def multi_modal(model_list,dataset, downstream_model,params):
528
  x_batch = pd.concat([x_batch, x_batch_1], axis=1)
529
  x_batch_test = pd.concat([x_batch_test, x_batch_test_1], axis=1)
530
 
531
-
532
  else:
533
  print("Model not available")
534
  return
@@ -538,11 +660,31 @@ def multi_modal(model_list,dataset, downstream_model,params):
538
 
539
  num_columns = x_batch.shape[1]
540
  x_batch.columns = [f'{i + 1}' for i in range(num_columns)]
541
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
542
 
543
  print(f"Representations loaded successfully")
544
  try:
545
- with open(f"./plot_emb/{task}_multi.pkl", "rb") as f1:
546
  class_0, class_1 = pickle.load(f1)
547
  except:
548
  print("Generating latent plots")
@@ -552,8 +694,8 @@ def multi_modal(model_list,dataset, downstream_model,params):
552
  features_umap = reducer.fit_transform(x_batch[:n_samples])
553
 
554
  if "Classifier" in downstream_model:
555
- try:x = y_batch.values[:n_samples]
556
- except:x = y_batch[:n_samples]
557
  index_0 = [index for index in range(len(x)) if x[index] == 0]
558
  index_1 = [index for index in range(len(x)) if x[index] == 1]
559
 
@@ -570,7 +712,10 @@ def multi_modal(model_list,dataset, downstream_model,params):
570
 
571
 
572
  if downstream_model == "XGBClassifier":
573
- xgb_predict_concat = XGBClassifier(**params)#n_estimators=5000, learning_rate=0.01, max_depth=10)
 
 
 
574
  xgb_predict_concat.fit(x_batch, y_batch)
575
 
576
  y_prob = xgb_predict_concat.predict_proba(x_batch_test)[:, 1]
@@ -608,21 +753,27 @@ def multi_modal(model_list,dataset, downstream_model,params):
608
  return result, roc_auc,fpr, tpr, class_0, class_1
609
 
610
  elif downstream_model == "SVR":
611
- regressor = SVR(**params)
 
 
 
612
  model = TransformedTargetRegressor(regressor= regressor,
613
  transformer = MinMaxScaler(feature_range=(-1, 1))
614
  ).fit(x_batch,y_batch)
615
-
616
  y_prob = model.predict(x_batch_test)
617
  RMSE_score = np.sqrt(mean_squared_error(y_batch_test, y_prob))
618
-
619
  print(f"RMSE Score: {RMSE_score:.4f}")
620
  result = f"RMSE Score: {RMSE_score:.4f}"
621
-
622
  return result, RMSE_score,y_batch_test, y_prob, class_0, class_1
623
 
624
  elif downstream_model == "Linear Regression":
625
- regressor = LinearRegression(**params)
 
 
 
626
  model = TransformedTargetRegressor(regressor=regressor,
627
  transformer=MinMaxScaler(feature_range=(-1, 1))
628
  ).fit(x_batch, y_batch)
@@ -636,7 +787,10 @@ def multi_modal(model_list,dataset, downstream_model,params):
636
  return result, RMSE_score, y_batch_test, y_prob, class_0, class_1
637
 
638
  elif downstream_model == "Kernel Ridge":
639
- regressor = KernelRidge(**params)
 
 
 
640
  model = TransformedTargetRegressor(regressor=regressor,
641
  transformer=MinMaxScaler(feature_range=(-1, 1))
642
  ).fit(x_batch, y_batch)
@@ -665,6 +819,144 @@ def multi_modal(model_list,dataset, downstream_model,params):
665
 
666
 
667
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
668
 
669
 
670
 
 
25
  import torch
26
  from transformers import AutoTokenizer, AutoModel
27
 
28
+ import sys
29
+ sys.path.append("models/")
30
+
31
+ from models.selfies_ted.load import SELFIES as bart
32
+ from models.mhg_model import load as mhg
33
+ from models.smi_ted.smi_ted_light.load import load_smi_ted
34
+
35
+ import mordred
36
+ from mordred import Calculator, descriptors
37
+ from rdkit import Chem
38
+ from rdkit.Chem import AllChem
39
 
40
  datasets = {}
41
  models = {}
 
56
 
57
 
58
  models = [{"Name": "bart","Model Name": "SELFIES-TED","Description": "BART model for string based SELFIES modality", "Timestamp": "2024-06-21 12:32:20"},
59
+ {"Name": "mol-xl","Model Name": "MolFormer", "Description": "MolFormer model for string based SMILES modality", "Timestamp": "2024-06-21 12:35:56"},
60
  {"Name": "mhg", "Model Name": "MHG-GED","Description": "Molecular hypergraph model", "Timestamp": "2024-07-10 00:09:42"},
61
  {"Name": "smi-ted", "Model Name": "SMI-TED","Description": "SMILES based encoder decoder model", "Timestamp": "2024-07-10 00:09:42"}]
62
 
 
66
 
67
  models = [{"Name": "smi-ted", "Model Name": "SMI-TED","Description": "SMILES based encoder decoder model"},
68
  {"Name": "bart","Model Name": "SELFIES-TED","Description": "BART model for string based SELFIES modality"},
69
+ {"Name": "mol-xl","Model Name": "MolFormer", "Description": "MolFormer model for string based SMILES modality"},
70
  {"Name": "mhg", "Model Name": "MHG-GED","Description": "Molecular hypergraph model"},
71
+ {"Name": "Mordred", "Model Name": "Mordred","Description": "Baseline: A descriptor-calculation software application that can calculate more than 1800 two- and three-dimensional descriptors"},
72
+ {"Name": "MorganFingerprint", "Model Name": "MorganFingerprint","Description": "Baseline: Circular atom environments based descriptor"}
73
  ]
74
 
75
 
 
80
 
81
  return models
82
 
83
def avail_downstream_models(raw=False):
    """Return the catalogue of supported downstream models.

    The catalogue is rebuilt on every call and is also stored in the
    module-level ``downstream_models`` variable.

    Args:
        raw: When true, return the list of dicts itself; otherwise return
            that list wrapped in a ``pandas.DataFrame``.

    Returns:
        A list of ``{"Name": ..., "Task Type": ...}`` dicts when ``raw`` is
        true, else a DataFrame with one row per entry.
    """
    global downstream_models

    # NOTE(review): "Classfication" is misspelled, but it is runtime data that
    # code outside this view may match on, so the spelling is kept unchanged.
    downstream_models = [
        {"Name": "XGBClassifier", "Task Type": "Classfication"},
        {"Name": "DefaultClassifier", "Task Type": "Classfication"},
        {"Name": "SVR", "Task Type": "Regression"},
        {"Name": "Kernel Ridge", "Task Type": "Regression"},
        {"Name": "Linear Regression", "Task Type": "Regression"},
        {"Name": "DefaultRegressor", "Task Type": "Regression"},
    ]

    if raw:
        return downstream_models
    return pd.DataFrame(downstream_models)
97
+
98
+
99
 
100
  def avail_datasets():
101
  global datasets
 
198
 
199
  avail_models_data()
200
 
201
+
202
+
203
  def get_representation(train_data,test_data,model_type, return_tensor=True):
204
  alias = {"MHG-GED": "mhg", "SELFIES-TED": "bart", "MolFormer": "mol-xl", "Molformer": "mol-xl", "SMI-TED": "smi-ted"}
205
  if model_type in alias.keys():
206
  model_type = alias[model_type]
207
 
208
  if model_type == "mhg":
209
+ model = mhg.load("../models/mhg_model/pickles/mhggnn_pretrained_model_0724_2023.pickle")
210
  with torch.no_grad():
211
  train_emb = model.encode(train_data)
212
  x_batch = torch.stack(train_emb)
 
218
  x_batch_test = pd.DataFrame(x_batch_test)
219
 
220
 
 
221
  elif model_type == "bart":
222
  model = bart()
223
  model.load()
 
225
  x_batch_test = model.encode(test_data, return_tensor=return_tensor)
226
 
227
  elif model_type == "smi-ted":
228
+ model = load_smi_ted(folder='../models/smi_ted/smi_ted_light', ckpt_filename='smi-ted-Light_40.pt')
229
  with torch.no_grad():
230
  x_batch = model.encode(train_data, return_torch=return_tensor)
231
  x_batch_test = model.encode(test_data, return_torch=return_tensor)
 
258
  if not return_tensor:
259
  x_batch = pd.DataFrame(x_batch)
260
  x_batch_test = pd.DataFrame(x_batch_test)
261
+
262
+ elif model_type == 'Mordred':
263
+ all_data = train_data + test_data
264
+ calc = Calculator(descriptors, ignore_3D=True)
265
+ mol_list = [Chem.MolFromSmiles(sm) for sm in all_data]
266
+ x_all = calc.pandas(mol_list)
267
+ print (f'original mordred fv dim: {x_all.shape}')
268
+
269
+ for j in x_all.columns:
270
+ for k in range(len(x_all[j])):
271
+ i = x_all.loc[k, j]
272
+ if type(i) is mordred.error.Missing or type(i) is mordred.error.Error:
273
+ x_all.loc[k, j] = np.nan
274
+
275
+ x_all.dropna(how="any", axis = 1, inplace=True)
276
+ print (f'Nan excluded mordred fv dim: {x_all.shape}')
277
+
278
+ x_batch = x_all.iloc[:len(train_data)]
279
+ x_batch_test = x_all.iloc[len(train_data):]
280
+ # print(f'x_batch: {len(x_batch)}, x_batch_test: {len(x_batch_test)}')
281
+
282
+ elif model_type == 'MorganFingerprint':
283
+ params = {'radius':2, 'nBits':1024}
284
+
285
+ mol_train = [Chem.MolFromSmiles(sm) for sm in train_data]
286
+ mol_test = [Chem.MolFromSmiles(sm) for sm in test_data]
287
+
288
+ x_batch = []
289
+ for mol in mol_train:
290
+ info = {}
291
+ fp = AllChem.GetMorganFingerprintAsBitVect(mol, **params, bitInfo=info)
292
+ vector = list(fp)
293
+ x_batch.append(vector)
294
+ x_batch = pd.DataFrame(x_batch)
295
+
296
+ x_batch_test = []
297
+ for mol in mol_test:
298
+ info = {}
299
+ fp = AllChem.GetMorganFingerprintAsBitVect(mol, **params, bitInfo=info)
300
+ vector = list(fp)
301
+ x_batch_test.append(vector)
302
+ x_batch_test = pd.DataFrame(x_batch_test)
303
 
304
  return x_batch, x_batch_test
305
 
306
+ def single_modal(model,dataset=None, downstream_model=None, params=None, x_train=None, x_test=None, y_train=None, y_test=None):
307
  print(model)
308
+ alias = {"MHG-GED":"mhg", "SELFIES-TED": "bart", "MolFormer":"mol-xl", "Molformer": "mol-xl", "SMI-TED": "smi-ted"}
309
  data = avail_models(raw=True)
310
  df = pd.DataFrame(data)
311
+ #print(list(df["Name"].values))
312
+
313
+ if model in list(df["Name"].values):
314
+ model_type = model
315
+ elif alias[model] in list(df["Name"].values):
316
  model_type = alias[model]
 
 
317
  else:
318
  print("Model not available")
319
  return
320
+
321
 
322
  data = avail_datasets()
323
  df = pd.DataFrame(data)
324
+ #print(list(df["Dataset"].values))
325
 
326
  if dataset in list(df["Dataset"].values):
327
  task = dataset
328
+ with open(f"representation/{task}_{model_type}.pkl", "rb") as f1:
329
  x_batch, y_batch, x_batch_test, y_batch_test = pickle.load(f1)
330
  print(f" Representation loaded successfully")
331
+
332
+ elif x_train==None:
333
 
334
  print("Custom Dataset")
335
  #return
 
347
 
348
  print(f" Representation loaded successfully")
349
 
350
+ else:
351
 
352
+ y_batch = y_train
353
+ y_batch_test = y_test
354
+ x_batch, x_batch_test = get_representation(x_train, x_test, model_type)
355
+
356
+ # exclude row containing Nan value
357
+ if isinstance(x_batch, torch.Tensor):
358
+ x_batch = pd.DataFrame(x_batch)
359
+ nan_indices = x_batch.index[x_batch.isna().any(axis=1)]
360
+ if len(nan_indices) > 0:
361
+ x_batch.dropna(inplace = True)
362
+ for index in sorted(nan_indices, reverse=True):
363
+ del y_batch[index]
364
+ print(f'x_batch Nan index: {nan_indices}')
365
+ print(f'x_batch shape: {x_batch.shape}, y_batch len: {len(y_batch)}')
366
+
367
+ if isinstance(x_batch_test, torch.Tensor):
368
+ x_batch_test = pd.DataFrame(x_batch_test)
369
+ nan_indices = x_batch_test.index[x_batch_test.isna().any(axis=1)]
370
+ if len(nan_indices) > 0:
371
+ x_batch_test.dropna(inplace = True)
372
+ for index in sorted(nan_indices, reverse=True):
373
+ del y_batch_test[index]
374
+ print(f'x_batch_test Nan index: {nan_indices}')
375
+ print(f'x_batch_test shape: {x_batch_test.shape}, y_batch_test len: {len(y_batch_test)}')
376
 
377
  print(f" Calculating ROC AUC Score ...")
378
 
379
  if downstream_model == "XGBClassifier":
380
+ if params == None:
381
+ xgb_predict_concat = XGBClassifier()
382
+ else:
383
+ xgb_predict_concat = XGBClassifier(**params) # n_estimators=5000, learning_rate=0.01, max_depth=10
384
  xgb_predict_concat.fit(x_batch, y_batch)
385
 
386
  y_prob = xgb_predict_concat.predict_proba(x_batch_test)[:, 1]
 
390
  print(f"ROC-AUC Score: {roc_auc:.4f}")
391
 
392
  try:
393
+ with open(f"plot_emb/{task}_{model_type}.pkl", "rb") as f1:
394
  class_0,class_1 = pickle.load(f1)
395
  except:
396
  print("Generating latent plots")
397
  reducer = umap.UMAP(metric='euclidean', n_neighbors=10, n_components=2, low_memory=True, min_dist=0.1,
398
  verbose=False)
399
  n_samples = np.minimum(1000, len(x_batch))
400
+
401
  try:x = y_batch.values[:n_samples]
402
  except: x = y_batch[:n_samples]
403
  index_0 = [index for index in range(len(x)) if x[index] == 0]
404
  index_1 = [index for index in range(len(x)) if x[index] == 1]
405
 
406
+ try:
407
+ features_umap = reducer.fit_transform(x_batch[:n_samples])
408
+ class_0 = features_umap[index_0]
409
+ class_1 = features_umap[index_1]
410
+ except:
411
+ class_0 = []
412
+ class_1 = []
413
  print("Generating latent plots : Done")
414
 
415
  #vizualize(roc_auc,fpr, tpr, x_batch, y_batch )
 
429
  print(f"ROC-AUC Score: {roc_auc:.4f}")
430
 
431
  try:
432
+ with open(f"plot_emb/{task}_{model_type}.pkl", "rb") as f1:
433
  class_0,class_1 = pickle.load(f1)
434
  except:
435
  print("Generating latent plots")
436
  reducer = umap.UMAP(metric='euclidean', n_neighbors= 10, n_components=2, low_memory=True, min_dist=0.1, verbose=False)
437
  n_samples = np.minimum(1000,len(x_batch))
 
 
 
 
 
438
 
439
+ try:
440
+ x = y_batch.values[:n_samples]
441
+ except:
442
+ x = y_batch[:n_samples]
443
+
444
+ try:
445
+ features_umap = reducer.fit_transform(x_batch[:n_samples])
446
+ index_0 = [index for index in range(len(x)) if x[index] == 0]
447
+ index_1 = [index for index in range(len(x)) if x[index] == 1]
448
+
449
+ class_0 = features_umap[index_0]
450
+ class_1 = features_umap[index_1]
451
+ except:
452
+ class_0 = []
453
+ class_1 = []
454
+
455
  print("Generating latent plots : Done")
456
 
457
  #vizualize(roc_auc,fpr, tpr, x_batch, y_batch )
 
459
  result = f"ROC-AUC Score: {roc_auc:.4f}"
460
 
461
  return result, roc_auc,fpr, tpr, class_0, class_1
462
+
463
  elif downstream_model == "SVR":
464
+ if params == None:
465
+ regressor = SVR()
466
+ else:
467
+ regressor = SVR(**params)
468
  model = TransformedTargetRegressor(regressor= regressor,
469
  transformer = MinMaxScaler(feature_range=(-1, 1))
470
  ).fit(x_batch,y_batch)
471
+
472
  y_prob = model.predict(x_batch_test)
473
  RMSE_score = np.sqrt(mean_squared_error(y_batch_test, y_prob))
474
+
475
  print(f"RMSE Score: {RMSE_score:.4f}")
476
  result = f"RMSE Score: {RMSE_score:.4f}"
477
 
 
479
  reducer = umap.UMAP(metric='euclidean', n_neighbors=10, n_components=2, low_memory=True, min_dist=0.1,
480
  verbose=False)
481
  n_samples = np.minimum(1000, len(x_batch))
482
+
483
+ try: x = y_batch.values[:n_samples]
484
+ except: x = y_batch[:n_samples]
485
  #index_0 = [index for index in range(len(x)) if x[index] == 0]
486
  #index_1 = [index for index in range(len(x)) if x[index] == 1]
487
 
488
+ try:
489
+ features_umap = reducer.fit_transform(x_batch[:n_samples])
490
+ class_0 = features_umap#[index_0]
491
+ class_1 = features_umap#[index_1]
492
+ except:
493
+ class_0 = []
494
+ class_1 = []
495
  print("Generating latent plots : Done")
496
+
497
  return result, RMSE_score,y_batch_test, y_prob, class_0, class_1
498
 
499
  elif downstream_model == "Kernel Ridge":
500
+ if params == None:
501
+ regressor = KernelRidge()
502
+ else:
503
+ regressor = KernelRidge(**params)
504
  model = TransformedTargetRegressor(regressor=regressor,
505
  transformer=MinMaxScaler(feature_range=(-1, 1))
506
  ).fit(x_batch, y_batch)
 
516
  verbose=False)
517
  n_samples = np.minimum(1000, len(x_batch))
518
  features_umap = reducer.fit_transform(x_batch[:n_samples])
519
+ try: x = y_batch.values[:n_samples]
520
+ except: x = y_batch[:n_samples]
521
  # index_0 = [index for index in range(len(x)) if x[index] == 0]
522
  # index_1 = [index for index in range(len(x)) if x[index] == 1]
523
 
 
529
 
530
 
531
  elif downstream_model == "Linear Regression":
532
+ if params == None:
533
+ regressor = LinearRegression()
534
+ else:
535
+ regressor = LinearRegression(**params)
536
  model = TransformedTargetRegressor(regressor=regressor,
537
  transformer=MinMaxScaler(feature_range=(-1, 1))
538
  ).fit(x_batch, y_batch)
 
549
  n_samples = np.minimum(1000, len(x_batch))
550
  features_umap = reducer.fit_transform(x_batch[:n_samples])
551
  try:x = y_batch.values[:n_samples]
552
+ except: x = y_batch[:n_samples]
553
  # index_0 = [index for index in range(len(x)) if x[index] == 0]
554
  # index_1 = [index for index in range(len(x)) if x[index] == 1]
555
 
 
578
  n_samples = np.minimum(1000, len(x_batch))
579
  features_umap = reducer.fit_transform(x_batch[:n_samples])
580
  try:x = y_batch.values[:n_samples]
581
+ except: x = y_batch[:n_samples]
582
  # index_0 = [index for index in range(len(x)) if x[index] == 0]
583
  # index_1 = [index for index in range(len(x)) if x[index] == 1]
584
 
 
587
  print("Generating latent plots : Done")
588
 
589
  return result, RMSE_score, y_batch_test, y_prob, class_0, class_1
590
+
591
 
592
+ def multi_modal(model_list,dataset=None, downstream_model=None,params=None, x_train=None, x_test=None, y_train=None, y_test=None):
593
+ #print(model_list)
 
594
  data = avail_datasets()
595
  df = pd.DataFrame(data)
596
  list(df["Dataset"].values)
 
598
  if dataset in list(df["Dataset"].values):
599
  task = dataset
600
  predefined = True
601
+ elif x_train==None:
602
  predefined = False
603
  components = dataset.split(",")
604
  train_data = pd.read_csv(components[0])[components[2]]
 
608
  y_batch_test = pd.read_csv(components[1])[components[3]]
609
 
610
  print("Custom Dataset loaded")
611
+ else:
612
+ predefined = False
613
+ y_batch = y_train
614
+ y_batch_test = y_test
615
+ train_data = x_train
616
+ test_data = x_test
617
 
618
  data = avail_models(raw=True)
619
  df = pd.DataFrame(data)
620
  list(df["Name"].values)
621
 
622
+ alias = {"MHG-GED":"mhg", "SELFIES-TED": "bart", "MolFormer":"mol-xl", "Molformer": "mol-xl","SMI-TED":"smi-ted", "Mordred": "Mordred", "MorganFingerprint": "MorganFingerprint"}
623
  #if set(model_list).issubset(list(df["Name"].values)):
624
  if set(model_list).issubset(list(alias.keys())):
625
  for i, model in enumerate(model_list):
 
630
 
631
  if i == 0:
632
  if predefined:
633
+ with open(f"representation/{task}_{model_type}.pkl", "rb") as f1:
634
  x_batch, y_batch, x_batch_test, y_batch_test = pickle.load(f1)
635
  print(f" Loaded representation/{task}_{model_type}.pkl")
636
  else:
 
640
 
641
  else:
642
  if predefined:
643
+ with open(f"representation/{task}_{model_type}.pkl", "rb") as f1:
644
  x_batch_1, y_batch_1, x_batch_test_1, y_batch_test_1 = pickle.load(f1)
645
  print(f" Loaded representation/{task}_{model_type}.pkl")
646
  else:
 
651
  x_batch = pd.concat([x_batch, x_batch_1], axis=1)
652
  x_batch_test = pd.concat([x_batch_test, x_batch_test_1], axis=1)
653
 
 
654
  else:
655
  print("Model not available")
656
  return
 
660
 
661
  num_columns = x_batch.shape[1]
662
  x_batch.columns = [f'{i + 1}' for i in range(num_columns)]
663
+
664
+ # exclude row containing Nan value
665
+ if isinstance(x_batch, torch.Tensor):
666
+ x_batch = pd.DataFrame(x_batch)
667
+ nan_indices = x_batch.index[x_batch.isna().any(axis=1)]
668
+ if len(nan_indices) > 0:
669
+ x_batch.dropna(inplace = True)
670
+ for index in sorted(nan_indices, reverse=True):
671
+ del y_batch[index]
672
+ print(f'x_batch Nan index: {nan_indices}')
673
+ print(f'x_batch shape: {x_batch.shape}, y_batch len: {len(y_batch)}')
674
+
675
+ if isinstance(x_batch_test, torch.Tensor):
676
+ x_batch_test = pd.DataFrame(x_batch_test)
677
+ nan_indices = x_batch_test.index[x_batch_test.isna().any(axis=1)]
678
+ if len(nan_indices) > 0:
679
+ x_batch_test.dropna(inplace = True)
680
+ for index in sorted(nan_indices, reverse=True):
681
+ del y_batch_test[index]
682
+ print(f'x_batch_test Nan index: {nan_indices}')
683
+ print(f'x_batch_test shape: {x_batch_test.shape}, y_batch_test len: {len(y_batch_test)}')
684
 
685
  print(f"Representations loaded successfully")
686
  try:
687
+ with open(f"plot_emb/{task}_multi.pkl", "rb") as f1:
688
  class_0, class_1 = pickle.load(f1)
689
  except:
690
  print("Generating latent plots")
 
694
  features_umap = reducer.fit_transform(x_batch[:n_samples])
695
 
696
  if "Classifier" in downstream_model:
697
+ try: x = y_batch.values[:n_samples]
698
+ except: x = y_batch[:n_samples]
699
  index_0 = [index for index in range(len(x)) if x[index] == 0]
700
  index_1 = [index for index in range(len(x)) if x[index] == 1]
701
 
 
712
 
713
 
714
  if downstream_model == "XGBClassifier":
715
+ if params == None:
716
+ xgb_predict_concat = XGBClassifier()
717
+ else:
718
+ xgb_predict_concat = XGBClassifier(**params)#n_estimators=5000, learning_rate=0.01, max_depth=10)
719
  xgb_predict_concat.fit(x_batch, y_batch)
720
 
721
  y_prob = xgb_predict_concat.predict_proba(x_batch_test)[:, 1]
 
753
  return result, roc_auc,fpr, tpr, class_0, class_1
754
 
755
  elif downstream_model == "SVR":
756
+ if params == None:
757
+ regressor = SVR()
758
+ else:
759
+ regressor = SVR(**params)
760
  model = TransformedTargetRegressor(regressor= regressor,
761
  transformer = MinMaxScaler(feature_range=(-1, 1))
762
  ).fit(x_batch,y_batch)
763
+
764
  y_prob = model.predict(x_batch_test)
765
  RMSE_score = np.sqrt(mean_squared_error(y_batch_test, y_prob))
766
+
767
  print(f"RMSE Score: {RMSE_score:.4f}")
768
  result = f"RMSE Score: {RMSE_score:.4f}"
769
+
770
  return result, RMSE_score,y_batch_test, y_prob, class_0, class_1
771
 
772
  elif downstream_model == "Linear Regression":
773
+ if params == None:
774
+ regressor = LinearRegression()
775
+ else:
776
+ regressor = LinearRegression(**params)
777
  model = TransformedTargetRegressor(regressor=regressor,
778
  transformer=MinMaxScaler(feature_range=(-1, 1))
779
  ).fit(x_batch, y_batch)
 
787
  return result, RMSE_score, y_batch_test, y_prob, class_0, class_1
788
 
789
  elif downstream_model == "Kernel Ridge":
790
+ if params == None:
791
+ regressor = KernelRidge()
792
+ else:
793
+ regressor = KernelRidge(**params)
794
  model = TransformedTargetRegressor(regressor=regressor,
795
  transformer=MinMaxScaler(feature_range=(-1, 1))
796
  ).fit(x_batch, y_batch)
 
819
 
820
 
821
 
822
def finetune_optuna(x_batch, y_batch, x_batch_test, y_test):
    """Build an Optuna objective that scores an XGBoost model by ROC-AUC.

    The inputs are converted to arrays via their ``.values`` attribute, so
    pandas objects (or anything exposing ``.values``) are expected.

    Args:
        x_batch: Training features.
        y_batch: Training labels.
        x_batch_test: Evaluation features.
        y_test: Evaluation labels.

    Returns:
        The ``objective(trial)`` callable. It samples hyper-parameters from
        ``trial``, trains a booster on the training data and returns the
        ROC-AUC obtained on the evaluation data. The callable was previously
        created and then discarded, leaving this function without effect;
        it is returned so a caller can hand it to an Optuna study.
    """
    print(" Finetuning with Optuna and calculating ROC AUC Score ...")
    X_train = x_batch.values
    y_train = y_batch.values
    X_test = x_batch_test.values
    y_test = y_test.values

    def objective(trial):
        """Train one booster with sampled parameters and return its ROC-AUC."""
        # Search space. No 'objective' is set, so xgboost falls back to its
        # default one. NOTE(review): confirm 'binary:logistic' is not wanted.
        params = {
            'eval_metric': 'auc',
            'verbosity': 0,
            'n_estimators': trial.suggest_int('n_estimators', 1000, 10000),
            'alpha': trial.suggest_loguniform('alpha', 1e-8, 1.0),
            'max_depth': trial.suggest_int('max_depth', 1, 12),
        }

        # xgb.train takes the number of boosting rounds through its
        # num_boost_round argument; left inside params, the sampled
        # 'n_estimators' value never reached the trainer.
        num_boost_round = params.pop('n_estimators')

        # Train XGBoost model
        dtrain = xgb.DMatrix(X_train, label=y_train)
        dtest = xgb.DMatrix(X_test, label=y_test)
        model = xgb.train(params, dtrain, num_boost_round=num_boost_round)

        # Predict scores for the evaluation rows
        y_pred = model.predict(dtest)

        # NOTE(review): the score is computed on the data passed as the test
        # split; confirm that tuning against it is intended.
        roc_auc = roc_auc_score(y_test, y_pred)
        print("ROC_AUC : ", roc_auc)

        return roc_auc

    return objective
860
+
861
def add_new_model():
    """Display a widget form that registers a new model in ``models.json``.

    The form collects a name, a description and a file path (optionally chosen
    through a native file dialog). On submit, a new entry with the current
    timestamp is appended to the list returned by ``avail_models(raw=True)``,
    the whole list is written to ``models.json`` in the working directory, and
    ``list_models()`` is called to show the result.
    """
    models = avail_models(raw=True)

    def display_models():
        """Print name, description and timestamp of every known model."""
        for model in models:
            model_display = f"Name: {model['Name']}, Description: {model['Description']}, Timestamp: {model['Timestamp']}"
            print(model_display)

    def update_models(new_name, new_description, new_path):
        """Append a new model entry and persist the full list to disk.

        ``new_path`` is accepted for interface symmetry with the form but is
        not stored: the "path" key is currently disabled.
        """
        new_model = {
            "Name": new_name,
            "Description": new_description,
            "Timestamp": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        }
        models.append(new_model)
        with open("models.json", "w") as outfile:
            json.dump(models, outfile)

        print("Model uploaded and updated successfully!")
        list_models()

    # Widgets
    name_text = widgets.Text(description="Name:", layout=Layout(width='50%'))
    description_text = widgets.Text(description="Description:", layout=Layout(width='50%'))
    path_text = widgets.Text(description="Path:", layout=Layout(width='50%'))

    def browse_callback(b):
        """Open a file dialog and copy the chosen path into the path field."""
        root = tk.Tk()
        root.withdraw()  # Hide the main window
        try:
            file_path = filedialog.askopenfilename(title="Select a Model File")
        finally:
            # Each click creates a fresh Tk root; destroy it so hidden root
            # windows do not accumulate for the lifetime of the kernel.
            root.destroy()
        if file_path:
            path_text.value = file_path

    browse_button = widgets.Button(description="Browse")
    browse_button.on_click(browse_callback)

    def submit_callback(b):
        """Register the model described by the current field values."""
        update_models(name_text.value, description_text.value, path_text.value)

    submit_button = widgets.Button(description="Submit")
    submit_button.on_click(submit_callback)

    # Display widgets
    display(VBox([name_text, description_text, path_text, browse_button, submit_button]))
909
+
910
+
911
def add_new_dataset():
    """Display a widget form that registers a new dataset in ``datasets.json``.

    The form collects the dataset name, its input and output column
    descriptions and a file path (optionally chosen through a native file
    dialog). On submit, a new entry is appended to the list returned by
    ``avail_datasets()``, the whole list is written to ``datasets.json`` in the
    working directory, and ``list_data()`` is called to show the result.
    """
    datasets = avail_datasets()

    def display_datasets():
        """Print the stored fields of every known dataset."""
        for dataset in datasets:
            dataset_display = f"Name: {dataset['Dataset']}, Input: {dataset['Input']},Output: {dataset['Output']},Path: {dataset['Path']}, Timestamp: {dataset['Timestamp']}"
            # The line was previously built but never shown.
            print(dataset_display)

    def update_datasets(new_dataset, new_input, new_output, new_path):
        """Append a new dataset entry and persist the full list to disk.

        Only the base name of ``new_path`` is stored, not the full path.
        """
        new_entry = {
            "Dataset": new_dataset,
            "Input": new_input,
            "Output": new_output,
            "Timestamp": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            "Path": os.path.basename(new_path),
        }
        datasets.append(new_entry)
        with open("datasets.json", "w") as outfile:
            json.dump(datasets, outfile)

        print("Dataset uploaded and updated successfully!")
        list_data()

    # Widgets
    dataset_text = widgets.Text(description="Dataset:", layout=Layout(width='50%'))
    input_text = widgets.Text(description="Input:", layout=Layout(width='50%'))
    output_text = widgets.Text(description="Output:", layout=Layout(width='50%'))
    path_text = widgets.Text(description="Path:", layout=Layout(width='50%'))

    def browse_callback(b):
        """Open a file dialog and copy the chosen path into the path field."""
        root = tk.Tk()
        root.withdraw()  # Hide the main window
        try:
            file_path = filedialog.askopenfilename(title="Select a Dataset File")
        finally:
            # Each click creates a fresh Tk root; destroy it so hidden root
            # windows do not accumulate for the lifetime of the kernel.
            root.destroy()
        if file_path:
            path_text.value = file_path

    browse_button = widgets.Button(description="Browse")
    browse_button.on_click(browse_callback)

    def submit_callback(b):
        """Register the dataset described by the current field values."""
        update_datasets(dataset_text.value, input_text.value, output_text.value, path_text.value)

    submit_button = widgets.Button(description="Submit")
    submit_button.on_click(submit_callback)

    display(VBox([dataset_text, input_text, output_text, path_text, browse_button, submit_button]))
960
 
961
 
962
 
models/mhg_model/README.md CHANGED
@@ -27,7 +27,7 @@ In addition, the decoder inherits the theoretical guarantee of MHG on always gen
27
 
28
  ### Pretrained Models and Training Logs
29
 
30
- We provide checkpoints of the MHG-GNN model pre-trained on a dataset of ~1.34M molecules curated from PubChem. (later) For model weights: [HuggingFace Link]()
31
 
32
  Add the MHG-GNN `pre-trained weights.pt` to the `models/` directory according to your needs.
33
 
 
27
 
28
  ### Pretrained Models and Training Logs
29
 
30
+ We provide checkpoints of the MHG-GNN model pre-trained on a dataset of ~1.34M molecules curated from PubChem. For model weights: [HuggingFace Link](https://huggingface.co/ibm/materials.mhg-ged/blob/main/mhggnn_pretrained_model_0724_2023.pickle)
31
 
32
  Add the MHG-GNN `pre-trained weights.pt` to the `models/` directory according to your needs.
33
 
models/mhg_model/images/mhg_example.png CHANGED

Git LFS Details

  • SHA256: 6ccfc7e1d40b44a82b17ef7db8d2b030e14d66cde3a0d641905b0e2b4a07abca
  • Pointer size: 130 Bytes
  • Size of remote file: 45.7 kB
models/mhg_model/images/mhg_example1.png CHANGED

Git LFS Details

  • SHA256: 18cd136996a79cacf1933d4263817351850cf6c9073633354172b26574540e45
  • Pointer size: 131 Bytes
  • Size of remote file: 270 kB
models/mhg_model/images/mhg_example2.png CHANGED

Git LFS Details

  • SHA256: 6cd5f2075efea13f79685f4f94e89efafd08358ef489f5bcda264770c76e528d
  • Pointer size: 130 Bytes
  • Size of remote file: 93 kB
models/mhg_model/load.py CHANGED
@@ -17,6 +17,7 @@ from typing_extensions import Self
17
 
18
  from .graph_grammar.io.smi import hg_to_mol
19
  from .models.mhgvae import GrammarGINVAE
 
20
  from huggingface_hub import hf_hub_download
21
 
22
 
@@ -73,12 +74,30 @@ class PretrainedModelWrapper:
73
  return output
74
 
75
 
76
- def load(model_name: str = "models/mhg_model/pickles/mhggnn_pretrained_model_0724_2023.pickle") -> Optional[
77
  PretrainedModelWrapper]:
 
78
  repo_id = "ibm/materials.mhg-ged"
79
- filename = "mhggnn_pretrained_model_0724_2023.pickle"
80
  file_path = hf_hub_download(repo_id=repo_id, filename=filename)
81
  with open(file_path, "rb") as f:
82
- model_dict = pickle.load(f)
83
  return PretrainedModelWrapper(model_dict)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
  return None
 
17
 
18
  from .graph_grammar.io.smi import hg_to_mol
19
  from .models.mhgvae import GrammarGINVAE
20
+
21
  from huggingface_hub import hf_hub_download
22
 
23
 
 
74
  return output
75
 
76
 
77
def load(model_name: str = "mhg_model/pickles/mhggnn_pretrained_model_0724_2023.pickle") -> Optional[
    PretrainedModelWrapper]:
    """Download the pre-trained MHG-GNN weights and wrap them for inference.

    The checkpoint ``pytorch_model.bin`` is fetched from the Hugging Face Hub
    repository ``ibm/materials.mhg-ged`` and deserialised with ``torch.load``.

    Args:
        model_name: Unused. Retained so existing callers that pass a local
            pickle path keep working; the weights always come from the Hub.

    Returns:
        A ``PretrainedModelWrapper`` built from the loaded model dictionary.
    """
    repo_id = "ibm/materials.mhg-ged"
    # Previously "mhggnn_pretrained_model_0724_2023.pickle".
    filename = "pytorch_model.bin"
    file_path = hf_hub_download(repo_id=repo_id, filename=filename)
    with open(file_path, "rb") as f:
        # NOTE(security): torch.load relies on pickle; only load checkpoints
        # from a trusted repository.
        model_dict = torch.load(f)
    return PretrainedModelWrapper(model_dict)
models/mhg_model/paper/MHG-GNN_Combination of Molecular Hypergraph Grammar with Graph Neural Network.pdf CHANGED
Binary files a/models/mhg_model/paper/MHG-GNN_Combination of Molecular Hypergraph Grammar with Graph Neural Network.pdf and b/models/mhg_model/paper/MHG-GNN_Combination of Molecular Hypergraph Grammar with Graph Neural Network.pdf differ
 
models/selfies_model/selfies-ted.png CHANGED

Git LFS Details

  • SHA256: 1229d74cd9473344d9907f5b8b2ae22694bdd77e94d3ae8f1f8dadacf538ee9e
  • Pointer size: 130 Bytes
  • Size of remote file: 47.6 kB
models/selfies_ted/README.md ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ library_name: transformers
4
+ pipeline_tag: feature-extraction
5
+ tags:
6
+ - chemistry
7
+ ---
8
+
9
+ # selfies-ted
10
+
11
+ selfies-ted is a project for encoding SMILES (Simplified Molecular Input Line Entry System) into SELFIES (SELF-referencing Embedded Strings) and generating embeddings for molecular representations.
12
+
13
+ ![selfies-ted](selfies-ted.png)
14
+ ## Model Architecture
15
+
16
+ Configuration details
17
+
18
+ Encoder and Decoder FFN dimensions: 256
19
+ Number of attention heads: 4
20
+ Number of encoder and decoder layers: 2
21
+ Total number of hidden layers: 6
22
+ Maximum position embeddings: 128
23
+ Model dimension (d_model): 256
24
+
25
+ ## Pretrained Models and Training Logs
26
+ We provide checkpoints of the selfies-ted model pre-trained on a dataset of molecules curated from PubChem. The pre-trained model shows competitive performance on molecular representation tasks. For model weights: "HuggingFace link".
27
+
28
+ To install and use the pre-trained model:
29
+
30
+ Download the selfies_ted_model.pkl file from the "HuggingFace link".
31
+ Add `selfies_ted_model.pkl` to the `models/` directory. The directory structure should look like the following:
32
+
33
+ ```
34
+ models/
35
+ └── selfies_ted_model.pkl
36
+ ```
37
+
38
+ ## Installation
39
+
40
+ To use this project, you'll need to install the required dependencies. We recommend using a virtual environment:
41
+
42
+ ```bash
43
+ python -m venv venv
44
+ source venv/bin/activate # On Windows use `venv\Scripts\activate`
45
+ ```
46
+
47
+ Install the required dependencies
48
+
49
+ ```
50
+ pip install -r requirements.txt
51
+ ```
52
+
53
+
54
+ ## Usage
55
+
56
+ ### Import
57
+
58
+ ```
59
+ import load
60
+ ```
61
+ ### Training the Model
62
+
63
+ To train the model, use the train.py script:
64
+
65
+ ```
66
+ python train.py -f <path_to_your_data_file>
67
+ ```
68
+
69
+
70
+ Note: The actual usage may depend on the specific implementation in load.py. Please refer to the source code for detailed functionality.
71
+
72
+ ### Load the model and tokenizer
73
+ ```
74
+ load.load("path/to/checkpoint.pkl")
75
+ ```
76
+ ### Encode SMILES strings
77
+ ```
78
+ smiles_list = ["COC", "CCO"]
79
+ ```
80
+ ```
81
+ embeddings = load.encode(smiles_list)
82
+ ```
83
+
84
+
85
+ ## Example Notebook
86
+
87
+ An example notebook for this project is available in `selfies-ted-example.ipynb`.
models/selfies_ted/load.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import torch
4
+ import selfies as sf # selfies>=2.1.1
5
+ import pickle
6
+ import pandas as pd
7
+ import numpy as np
8
+ from datasets import Dataset
9
+ from rdkit import Chem
10
+ from transformers import AutoTokenizer, AutoModel
11
+
12
+
13
class SELFIES(torch.nn.Module):
    """Encode SMILES strings into selfies-ted embeddings.

    SMILES are converted to SELFIES, tokenised, passed through the encoder of
    the ``ibm/materials.selfies-ted`` model and mean-pooled over the
    non-padding tokens. Call :meth:`load` before :meth:`encode`.
    """

    def __init__(self):
        super().__init__()
        self.model = None
        self.tokenizer = None
        # Positions (within the last batch given to `get_selfies`) of SMILES
        # that could not be converted to SELFIES.
        self.invalid = []

    def get_selfies(self, smiles_list):
        """Convert SMILES strings to space-separated SELFIES strings.

        A SMILES that fails direct encoding is canonicalised with RDKit and
        retried; if that also fails, the placeholder ``"[]"`` is emitted and
        the item's position is recorded in ``self.invalid``.

        Args:
            smiles_list: Iterable of SMILES strings.

        Returns:
            List of SELFIES strings with symbols separated by single spaces,
            in the same order as the input.
        """
        self.invalid = []
        spaced_selfies_batch = []
        for i, smiles in enumerate(smiles_list):
            try:
                selfies = sf.encoder(smiles.rstrip())
            except Exception:  # not a bare except: let Ctrl-C / SystemExit through
                try:
                    smiles = Chem.MolToSmiles(Chem.MolFromSmiles(smiles.rstrip()))
                    selfies = sf.encoder(smiles)
                except Exception:
                    selfies = "[]"
                    self.invalid.append(i)

            spaced_selfies_batch.append(selfies.replace('][', '] ['))

        return spaced_selfies_batch

    def get_embedding(self, selfies):
        """Embed a batch of SELFIES strings by masked mean pooling.

        Args:
            selfies: Mapping with a ``"selfies"`` entry holding the batch of
                space-separated SELFIES strings.

        Returns:
            The tokenizer output with ``input_ids`` and ``attention_mask``
            removed and an ``"embedding"`` entry added (one vector per input).
        """
        encoding = self.tokenizer(selfies["selfies"], return_tensors='pt', max_length=128, truncation=True, padding='max_length')
        input_ids = encoding['input_ids']
        attention_mask = encoding['attention_mask']
        outputs = self.model.encoder(input_ids=input_ids, attention_mask=attention_mask)
        model_output = outputs.last_hidden_state

        # Mean over real tokens only: padded positions are zeroed by the mask
        # and the divisor is the number of unmasked tokens (clamped to avoid
        # division by zero).
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(model_output.size()).float()
        sum_embeddings = torch.sum(model_output * input_mask_expanded, 1)
        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        model_output = sum_embeddings / sum_mask

        del encoding['input_ids']
        del encoding['attention_mask']

        encoding["embedding"] = model_output

        return encoding

    def load(self, checkpoint="bart-2908.pickle"):
        """Load the tokenizer and model from ``ibm/materials.selfies-ted``.

        Args:
            checkpoint: Unused. Kept for backward compatibility with callers
                that still pass a pickle file name.
        """
        self.tokenizer = AutoTokenizer.from_pretrained("ibm/materials.selfies-ted")
        self.model = AutoModel.from_pretrained("ibm/materials.selfies-ted")

    # TODO: remove `use_gpu` argument in validation pipeline
    def encode(self, smiles_list=None, use_gpu=False, return_tensor=False):
        """Return one embedding per SMILES string.

        Rows for SMILES that cannot be converted to SELFIES are filled with
        NaN and a message is printed for each of them.

        Args:
            smiles_list: Iterable of SMILES strings (list, pandas Series, ...).
                ``None`` or an empty iterable yields an empty result.
            use_gpu: Unused; see the TODO above.
            return_tensor: If true, return a ``torch.Tensor`` instead of a
                ``pandas.DataFrame``.

        Returns:
            A DataFrame (default) or tensor with one row per input SMILES. For
            empty input, an empty DataFrame or a ``(0, 0)`` tensor.
        """
        # Materialise as a list so that the positional indices recorded in
        # `self.invalid` are valid for lookups below; indexing a pandas Series
        # with `[idx]` would be label-based instead.
        smiles_list = [] if smiles_list is None else list(smiles_list)
        if not smiles_list:
            return torch.empty((0, 0)) if return_tensor else pd.DataFrame()

        selfies = self.get_selfies(smiles_list)
        selfies_df = pd.DataFrame(selfies, columns=["selfies"])
        data = Dataset.from_pandas(selfies_df)
        # Inference only: skip building the autograd graph.
        with torch.no_grad():
            embedding = data.map(self.get_embedding, batched=True, num_proc=1, batch_size=128)
        emb = np.asarray(embedding["embedding"].copy())

        for idx in self.invalid:
            emb[idx] = np.nan
            print("Cannot encode {0} to selfies and embedding replaced by NaN".format(smiles_list[idx]))

        if return_tensor:
            return torch.tensor(emb)
        return pd.DataFrame(emb)
models/selfies_ted/requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ torch>=2.1.0
2
+ transformers>=4.38
3
+ numpy>=1.26.1
4
+ datasets>=2.13.1
5
+ evaluate>=0.4.0
6
+ selfies>=2.1.0
7
+ scikit-learn>=1.2.1
8
+ pyarrow>=14.0.1
9
+ requests>=2.31.0
10
+ urllib3>=2.0.7
11
+ aiohttp>=3.9.0
12
+ zipp>=3.17.0
models/selfies_ted/selfies-ted-example.ipynb ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "9d9b6eb8-9edb-44bd-9e5a-3a6ea67f5117",
6
+ "metadata": {},
7
+ "source": [
8
+ "### Import library"
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "code",
13
+ "execution_count": 1,
14
+ "id": "c3ac4418",
15
+ "metadata": {},
16
+ "outputs": [],
17
+ "source": [
18
+ "from load import SELFIES"
19
+ ]
20
+ },
21
+ {
22
+ "cell_type": "markdown",
23
+ "id": "790061cf-5470-4564-987e-aa2e492337db",
24
+ "metadata": {},
25
+ "source": [
26
+ "### Initialize and load"
27
+ ]
28
+ },
29
+ {
30
+ "cell_type": "code",
31
+ "execution_count": 2,
32
+ "id": "85847f26-e2f4-475a-a88e-41fd9cccfc0f",
33
+ "metadata": {},
34
+ "outputs": [],
35
+ "source": [
36
+ "model = SELFIES()"
37
+ ]
38
+ },
39
+ {
40
+ "cell_type": "code",
41
+ "execution_count": 3,
42
+ "id": "095e864c",
43
+ "metadata": {
44
+ "scrolled": true
45
+ },
46
+ "outputs": [],
47
+ "source": [
48
+ "model.load(checkpoint=\"bart-2908.pickle\")"
49
+ ]
50
+ },
51
+ {
52
+ "cell_type": "markdown",
53
+ "id": "55f1a68c-c462-4dee-9139-9befb469f176",
54
+ "metadata": {},
55
+ "source": [
56
+ "### Example to get embeddings"
57
+ ]
58
+ },
59
+ {
60
+ "cell_type": "code",
61
+ "execution_count": 4,
62
+ "id": "2357ef0a",
63
+ "metadata": {},
64
+ "outputs": [
65
+ {
66
+ "data": {
67
+ "application/vnd.jupyter.widget-view+json": {
68
+ "model_id": "b494cbf9878a4f5c8f4093e38fb82fd5",
69
+ "version_major": 2,
70
+ "version_minor": 0
71
+ },
72
+ "text/plain": [
73
+ "Map: 0%| | 0/3 [00:00<?, ? examples/s]"
74
+ ]
75
+ },
76
+ "metadata": {},
77
+ "output_type": "display_data"
78
+ }
79
+ ],
80
+ "source": [
81
+ "smiles_list = [\"CCO\", \"O=C=O\", \"OC(=O)c1ccccc1C(=O)O\"]\n",
82
+ "embeddings = model.encode(smiles_list)"
83
+ ]
84
+ },
85
+ {
86
+ "cell_type": "code",
87
+ "execution_count": 5,
88
+ "id": "3871c513-d0a9-4e70-9c18-3f0b491e07b2",
89
+ "metadata": {},
90
+ "outputs": [
91
+ {
92
+ "data": {
93
+ "text/plain": [
94
+ "(3, 1024)"
95
+ ]
96
+ },
97
+ "execution_count": 5,
98
+ "metadata": {},
99
+ "output_type": "execute_result"
100
+ }
101
+ ],
102
+ "source": [
103
+ "embeddings.shape"
104
+ ]
105
+ },
106
+ {
107
+ "cell_type": "code",
108
+ "execution_count": null,
109
+ "id": "289a8795-d6d8-4828-b2b2-b4d4a97a4604",
110
+ "metadata": {},
111
+ "outputs": [],
112
+ "source": []
113
+ }
114
+ ],
115
+ "metadata": {
116
+ "kernelspec": {
117
+ "display_name": "Python 3 (ipykernel)",
118
+ "language": "python",
119
+ "name": "python3"
120
+ },
121
+ "language_info": {
122
+ "codemirror_mode": {
123
+ "name": "ipython",
124
+ "version": 3
125
+ },
126
+ "file_extension": ".py",
127
+ "mimetype": "text/x-python",
128
+ "name": "python",
129
+ "nbconvert_exporter": "python",
130
+ "pygments_lexer": "ipython3",
131
+ "version": "3.10.8"
132
+ }
133
+ },
134
+ "nbformat": 4,
135
+ "nbformat_minor": 5
136
+ }
models/selfies_ted/selfies-ted.png ADDED

Git LFS Details

  • SHA256: 1229d74cd9473344d9907f5b8b2ae22694bdd77e94d3ae8f1f8dadacf538ee9e
  • Pointer size: 130 Bytes
  • Size of remote file: 47.6 kB
models/smi_ted/.gitignore ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Model weights
2
+ inference/smi_ted_light/smi-ted-Light_40.pt
3
+
4
+ # pyenv
5
+ .python-version
6
+
7
+ # Environments
8
+ .env
9
+ .venv
10
+ env/
11
+ venv/
12
+ ENV/
13
+ env.bak/
14
+ venv.bak/
15
+
16
+ # editor files
17
+ .vscode/
18
+ .DS_Store
models/smi_ted/README.md ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SMILES-based Transformer Encoder-Decoder (SMI-TED)
2
+
3
+ This repository provides PyTorch source code associated with our publication, "A Large Encoder-Decoder Family of Foundation Models for Chemical Language".
4
+
5
+ **Paper:** [Arxiv Link](https://arxiv.org/abs/2407.20267)
6
+
7
+ **HuggingFace:** [HuggingFace Link](https://huggingface.co/ibm/materials.smi-ted)
8
+
9
+ For more information contact: [email protected] or [email protected].
10
+
11
+ ![ted-smi](images/smi-ted.png)
12
+
13
+ ## Introduction
14
+
15
+ We present a large encoder-decoder chemical foundation model, SMILES-based Transformer Encoder-Decoder (SMI-TED), pre-trained on a curated dataset of 91 million SMILES samples sourced from PubChem, equivalent to 4 billion molecular tokens. SMI-TED supports various complex tasks, including quantum property prediction, with two main variants ($289M$ and $8 \times 289M$). Our experiments across multiple benchmark datasets demonstrate state-of-the-art performance for various tasks. Model weights are available at: [HuggingFace Link](https://huggingface.co/ibm/materials.smi-ted).
16
+
17
+ ## Table of Contents
18
+
19
+ 1. [Getting Started](#getting-started)
20
+ 1. [Pretrained Models and Training Logs](#pretrained-models-and-training-logs)
21
+ 2. [Replicating Conda Environment](#replicating-conda-environment)
22
+ 2. [Pretraining](#pretraining)
23
+ 3. [Finetuning](#finetuning)
24
+ 4. [Feature Extraction](#feature-extraction)
25
+ 5. [Citations](#citations)
26
+
27
+ ## Getting Started
28
+
29
+ **This code and environment have been tested on Nvidia V100s and Nvidia A100s**
30
+
31
+ ### Pretrained Models and Training Logs
32
+
33
+ We provide checkpoints of the SMI-TED model pre-trained on a dataset of ~91M molecules curated from PubChem. The pre-trained model shows competitive performance on classification and regression benchmarks from MoleculeNet. For model weights: [HuggingFace Link](https://huggingface.co/ibm/materials.smi-ted)
34
+
35
+ Add the SMI-TED `pre-trained weights.pt` to the `inference/` or `finetune/` directory according to your needs. The directory structure should look like the following:
36
+
37
+ ```
38
+ inference/
39
+ ├── smi_ted_light
40
+ │ ├── smi_ted_light.pt
41
+ │ ├── bert_vocab_curated.txt
42
+ │ └── load.py
43
+ ```
44
+ and/or:
45
+
46
+ ```
47
+ finetune/
48
+ ├── smi_ted_light
49
+ │ ├── smi_ted_light.pt
50
+ │ ├── bert_vocab_curated.txt
51
+ │ └── load.py
52
+ ```
53
+
54
+ ### Replicating Conda Environment
55
+
56
+ Follow these steps to replicate our Conda environment and install the necessary libraries:
57
+
58
+ #### Create and Activate Conda Environment
59
+
60
+ ```
61
+ conda create --name smi-ted-env python=3.10
62
+ conda activate smi-ted-env
63
+ ```
64
+
65
+ #### Install Packages with Conda
66
+
67
+ ```
68
+ conda install pytorch=2.1.0 pytorch-cuda=11.8 -c pytorch -c nvidia
69
+ ```
70
+
71
+ #### Install Packages with Pip
72
+
73
+ ```
74
+ pip install -r requirements.txt
75
+ pip install pytorch-fast-transformers
76
+ ```
77
+
78
+ ## Pretraining
79
+
80
+ For pretraining, we use two strategies: the masked language model method to train the encoder part and an encoder-decoder strategy to refine SMILES reconstruction and improve the generated latent space.
81
+
82
+ SMI-TED is pre-trained on canonicalized and curated 91M SMILES from PubChem with the following constraints:
83
+
84
+ - Compounds are filtered to a maximum length of 202 tokens during preprocessing.
85
+ - A 95/5/0 split is used for encoder training, with 5% of the data for decoder pretraining.
86
+ - A 100/0/0 split is also used to train the encoder and decoder directly, enhancing model performance.
87
+
88
+ The pretraining code provides examples of data processing and model training on a smaller dataset, requiring 8 A100 GPUs.
89
+
90
+ To pre-train the two variants of the SMI-TED model, run:
91
+
92
+ ```
93
+ bash training/run_model_light_training.sh
94
+ ```
95
+ or
96
+ ```
97
+ bash training/run_model_large_training.sh
98
+ ```
99
+
100
+ Use `train_model_D.py` to train only the decoder or `train_model_ED.py` to train both the encoder and decoder.
101
+
102
+ ## Finetuning
103
+
104
+ The finetuning datasets and environment can be found in the [finetune](finetune/) directory. After setting up the environment, you can run a finetuning task with:
105
+
106
+ ```
107
+ bash finetune/smi_ted_light/esol/run_finetune_esol.sh
108
+ ```
109
+
110
+ Finetuning training/checkpointing resources will be available in directories named `checkpoint_<measure_name>`.
111
+
112
+ ## Feature Extraction
113
+
114
+ The example notebook [smi_ted_encoder_decoder_example.ipynb](notebooks/smi_ted_encoder_decoder_example.ipynb) contains code to load checkpoint files and use the pre-trained model for encoder and decoder tasks. It also includes examples of classification and regression tasks. For model weights: [HuggingFace Link](https://huggingface.co/ibm/materials.smi-ted)
115
+
116
+ To load smi-ted, you can simply use:
117
+
118
+ ```python
119
+ model = load_smi_ted(
120
+ folder='../inference/smi_ted_light',
121
+ ckpt_filename='smi_ted_light.pt'
122
+ )
123
+ ```
124
+
125
+ To encode SMILES into embeddings, you can use:
126
+
127
+ ```python
128
+ with torch.no_grad():
129
+ encoded_embeddings = model.encode(df['SMILES'], return_torch=True)
130
+ ```
131
+ To decode embeddings back into SMILES strings, you can use:
132
+
133
+ ```python
134
+ with torch.no_grad():
135
+ decoded_smiles = model.decode(encoded_embeddings)
136
+ ```
137
+
138
+
models/smi_ted/finetune/args.py ADDED
@@ -0,0 +1,337 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+
3
+
4
+ def get_parser(parser=None):
5
+ if parser is None:
6
+ parser = argparse.ArgumentParser()
7
+
8
+ # Model
9
+ # model_arg = parser.add_argument_group('Model')
10
+ parser.add_argument("--n_head", type=int, default=8, help="GPT number of heads")
11
+ parser.add_argument("--n_layer", type=int, default=12, help="GPT number of layers")
12
+ parser.add_argument(
13
+ "--q_dropout", type=float, default=0.5, help="Encoder layers dropout"
14
+ )
15
+ parser.add_argument(
16
+ "--d_dropout", type=float, default=0.1, help="Decoder layers dropout"
17
+ )
18
+ parser.add_argument(
19
+ "--n_embd", type=int, default=768, help="Latent vector dimensionality"
20
+ )
21
+ parser.add_argument(
22
+ "--fc_h", type=int, default=512, help="Fully connected hidden dimensionality"
23
+ )
24
+ parser.add_argument("--n_output", type=int, default=1)
25
+
26
+ # Train
27
+ # train_arg = parser.add_argument_group('Train')
28
+ parser.add_argument("--n_batch", type=int, default=512, help="Batch size")
29
+ parser.add_argument(
30
+ "--unlike_alpha", type=float, default=1.0, help="unlikelihood loss alpha weight"
31
+ )
32
+ parser.add_argument(
33
+ "--from_scratch",
34
+ action="store_true",
35
+ default=False,
36
+ help="train on qm9 from scratch",
37
+ )
38
+ parser.add_argument(
39
+ "--unlikelihood",
40
+ action="store_true",
41
+ default=False,
42
+ help="use unlikelihood loss with gpt pretrain",
43
+ )
44
+ parser.add_argument(
45
+ "--grad_acc",
46
+ type=int,
47
+ default=1,
48
+ help="number of batches to accumulate gradients",
49
+ )
50
+ parser.add_argument(
51
+ "--checkpoint_every",
52
+ type=int,
53
+ default=1000,
54
+ help="save checkpoint every x iterations",
55
+ )
56
+ parser.add_argument(
57
+ "--clip_grad", type=int, default=50, help="Clip gradients to this value"
58
+ )
59
+ parser.add_argument(
60
+ "--lr_start", type=float, default=3 * 1e-4, help="Initial lr value"
61
+ )
62
+ parser.add_argument(
63
+ "--lr_end", type=float, default=3 * 1e-4, help="Maximum lr weight value"
64
+ )
65
+ parser.add_argument(
66
+ "--lr_multiplier", type=int, default=1, help="lr weight multiplier"
67
+ )
68
+ parser.add_argument(
69
+ "--n_last", type=int, default=1000, help="Number of iters to smooth loss calc"
70
+ )
71
+ parser.add_argument("--n_jobs", type=int, default=1, help="Number of threads")
72
+ parser.add_argument(
73
+ "--accelerator",
74
+ type=str,
75
+ default="ddp",
76
+ help="The accelerator backend to use (previously known as distributed_backend)",
77
+ )
78
+ parser.add_argument(
79
+ "--num_nodes",
80
+ type=int,
81
+ default=1,
82
+ help="number of GPU nodes for distributed training",
83
+ )
84
+ parser.add_argument(
85
+ "--device",
86
+ type=str,
87
+ default="cuda",
88
+ help='Device to run: "cpu" or "cuda:<device number>"',
89
+ )
90
+ parser.add_argument("--seed", type=int, default=12345, help="Seed")
91
+ parser.add_argument(
92
+ "--init_params_from",
93
+ type=str,
94
+ default="",
95
+ help="Path to a ckpt used to initialize the parameters if no restart_path is provided",
96
+ )
97
+ parser.add_argument(
98
+ "--train_decoder_every",
99
+ type=int,
100
+ default=10,
101
+ help="Optimize decoder params every n batches",
102
+ )
103
+ parser.add_argument(
104
+ "--lr_decoder", type=float, default=1e-4, help="Learning rate for decoder part"
105
+ )
106
+ parser.add_argument(
107
+ "--local_rank",
108
+ type=int,
109
+ default=-1,
110
+ help="local_rank for distributed training on gpus",
111
+ )
112
+ parser.add_argument("--gpu", default=None, type=int, help="GPU id to use.")
113
+ parser.add_argument(
114
+ "--dist-backend", default="nccl", type=str, help="distributed backend"
115
+ )
116
+ parser.add_argument(
117
+ "--tensorboard_path", default="./runs/deepspeed", help="tensorboard log dir"
118
+ )
119
+
120
+ # common_arg = parser.add_argument_group('Common')
121
+ parser.add_argument(
122
+ "--vocab_load", type=str, required=False, help="Where to load the vocab"
123
+ )
124
+ parser.add_argument(
125
+ "--n_samples", type=int, required=False, help="Number of samples to sample"
126
+ )
127
+ parser.add_argument(
128
+ "--gen_save", type=str, required=False, help="Where to save the gen molecules"
129
+ )
130
+ parser.add_argument(
131
+ "--max_len", type=int, default=100, help="Max of length of SMILES"
132
+ )
133
+ parser.add_argument(
134
+ "--train_load", type=str, required=False, help="Where to load the model"
135
+ )
136
+ parser.add_argument(
137
+ "--val_load", type=str, required=False, help="Where to load the model"
138
+ )
139
+ parser.add_argument(
140
+ "--n_workers",
141
+ type=int,
142
+ required=False,
143
+ default=1,
144
+ help="Where to load the model",
145
+ )
146
+ # beam search hyper parameters
147
+ parser.add_argument(
148
+ "--beam_size", type=int, default=0, help="Number of beams to generate"
149
+ )
150
+ parser.add_argument(
151
+ "--num_seq_returned",
152
+ type=int,
153
+ default=0,
154
+ help="number of beams to be returned (must be <= beam_size",
155
+ )
156
+ parser.add_argument(
157
+ "--min_len", type=int, default=1, help="minimum length to be generated"
158
+ )
159
+ parser.add_argument(
160
+ "--nucleus_thresh", type=float, default=0.9, help="nucleus sampling threshold"
161
+ )
162
+ parser.add_argument(
163
+ "--finetune_path",
164
+ type=str,
165
+ default="",
166
+ help="path to trainer file to continue training",
167
+ )
168
+ parser.add_argument(
169
+ "--restart_path",
170
+ type=str,
171
+ default="",
172
+ help="path to trainer file to continue training",
173
+ )
174
+ parser.add_argument(
175
+ "--data_path", type=str, default="", help="path to pubchem file"
176
+ )
177
+ parser.add_argument(
178
+ "--pretext_size", type=int, default=0, help="number of k-mers to pretext"
179
+ )
180
+ parser.add_argument(
181
+ "--model_save_dir",
182
+ type=str,
183
+ required=False,
184
+ default="./models_dump/",
185
+ help="Where to save the models/log/config/vocab",
186
+ )
187
+ parser.add_argument(
188
+ "--model_save",
189
+ type=str,
190
+ required=False,
191
+ default="model.pt",
192
+ help="Where to save the model",
193
+ )
194
+ # parser.add_argument('--save_frequency',
195
+ # type=int, default=20,
196
+ # help='How often to save the model')
197
+ parser.add_argument(
198
+ "--num_epoch", type=int, default=1, help="number of epochs to train"
199
+ )
200
+ # parser.add_argument('--num_iter',
201
+ # type=int, default=-1,
202
+ # help='how many itersations per epoch (for unlikelihood tuning)')
203
+ parser.add_argument(
204
+ "--log_file", type=str, required=False, help="Where to save the log"
205
+ )
206
+ parser.add_argument(
207
+ "--tb_loc",
208
+ type=str,
209
+ required=False,
210
+ help="Where to save the tensorflow location",
211
+ )
212
+ parser.add_argument(
213
+ "--config_save", type=str, required=False, help="Where to save the config"
214
+ )
215
+ parser.add_argument("--vocab_save", type=str, help="Where to save the vocab")
216
+
217
+ # resume_arg = parser.add_argument_group('Resume')
218
+ parser.add_argument(
219
+ "--debug",
220
+ default=False,
221
+ action="store_true",
222
+ help="do not erase cache at end of program",
223
+ )
224
+ parser.add_argument(
225
+ "--fast_dev_run",
226
+ default=False,
227
+ help="This flag runs a “unit test” by running n if set to n (int) else 1 if set to True training and validation batch(es).",
228
+ )
229
+ parser.add_argument(
230
+ "--freeze_model",
231
+ default=False,
232
+ action="store_true",
233
+ help="freeze weights of bert model during fine tuning",
234
+ )
235
+ parser.add_argument(
236
+ "--resume", default=False, action="store_true", help="Resume from a saved model"
237
+ )
238
+ parser.add_argument(
239
+ "--rotate",
240
+ default=False,
241
+ action="store_true",
242
+ help="use rotational relative embedding",
243
+ )
244
+ parser.add_argument(
245
+ "--model_load", type=str, required=False, help="Where to load the model"
246
+ )
247
+ parser.add_argument(
248
+ "--root_dir", type=str, required=False, default=".", help="location of root dir"
249
+ )
250
+ parser.add_argument(
251
+ "--config_load", type=str, required=False, help="Where to load the config"
252
+ )
253
+ parser.add_argument(
254
+ "--gpus", type=int, required=False, default=1, help="number of gpus to use"
255
+ )
256
+ # parser.add_argument('--start_epoch',
257
+ # type=int, required=False, default=0,
258
+ # help='Where to load the config')
259
+
260
+ parser.add_argument(
261
+ "--model_arch",
262
+ type=str,
263
+ required=False,
264
+ help="used to teack model arch in params",
265
+ )
266
+ parser.add_argument(
267
+ "--eval_every",
268
+ type=int,
269
+ default=50000,
270
+ help="run evaluation every x iterations",
271
+ )
272
+ parser.add_argument(
273
+ "--num_feats",
274
+ type=int,
275
+ required=False,
276
+ default=32,
277
+ help="number of random reatures for FAVOR+",
278
+ )
279
+ parser.add_argument(
280
+ "--max_epochs", type=int, required=False, default=1, help="max number of epochs"
281
+ )
282
+
283
+ # debug() FINE TUNING
284
+ # parser.add_argument('--save_dir', type=str, required=True)
285
+ parser.add_argument(
286
+ "--mode", type=str, default="cls", help="type of pooling to use"
287
+ )
288
+ parser.add_argument("--dataset_length", type=int, default=None, required=False)
289
+ parser.add_argument("--num_workers", type=int, default=0, required=False)
290
+ parser.add_argument("--dropout", type=float, default=0.1, required=False)
291
+ # parser.add_argument("--dims", type=int, nargs="*", default="", required=False)
292
+ parser.add_argument(
293
+ "--smiles_embedding",
294
+ type=str,
295
+ default="/dccstor/medscan7/smallmolecule/runs/ba-predictor/small-data/embeddings/protein/ba_embeddings_tanh_512_2986138_2.pt",
296
+ )
297
+ # parser.add_argument("--train_pct", type=str, required=False, default="95")
298
+ # parser.add_argument("--aug", type=int, required=True)
299
+ parser.add_argument("--dataset_name", type=str, required=False, default="sol")
300
+ parser.add_argument("--measure_name", type=str, required=False, default="measure")
301
+ # parser.add_argument("--emb_type", type=str, required=True)
302
+ parser.add_argument("--checkpoints_folder", type=str, required=True)
303
+ # parser.add_argument("--results_dir", type=str, required=True)
304
+ # parser.add_argument("--patience_epochs", type=int, required=True)
305
+ parser.add_argument("--model_path", type=str, default="./smi_ted/")
306
+ parser.add_argument("--ckpt_filename", type=str, default="smi_ted_Light_40.pt")
307
+ parser.add_argument("--restart_filename", type=str, default="")
308
+ # parser.add_argument('--n_output', type=int, default=1)
309
+ parser.add_argument("--save_every_epoch", type=int, default=0)
310
+ parser.add_argument("--save_ckpt", type=int, default=1)
311
+ parser.add_argument("--start_seed", type=int, default=0)
312
+ parser.add_argument("--smi_ted_version", type=str, default="v1")
313
+ parser.add_argument("--train_decoder", type=int, default=1)
314
+ parser.add_argument("--target_metric", type=str, default="rmse")
315
+ parser.add_argument("--loss_fn", type=str, default="mae")
316
+
317
+ parser.add_argument(
318
+ "--data_root",
319
+ type=str,
320
+ required=False,
321
+ default="/dccstor/medscan7/smallmolecule/runs/ba-predictor/small-data/affinity",
322
+ )
323
+ # parser.add_argument("--use_bn", type=int, default=0)
324
+ parser.add_argument("--use_linear", type=int, default=0)
325
+
326
+ parser.add_argument("--lr", type=float, default=0.001)
327
+ # parser.add_argument("--weight_decay", type=float, default=5e-4)
328
+ # parser.add_argument("--val_check_interval", type=float, default=1.0)
329
+ parser.add_argument("--batch_size", type=int, default=64)
330
+
331
+ return parser
332
+
333
+
334
def parse_args():
    """Build the option parser via ``get_parser()`` and parse the command line.

    Returns:
        Whatever ``parser.parse_args()`` yields for the current process
        arguments.
    """
    return get_parser().parse_args()
models/smi_ted/finetune/finetune_classification.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Deep learning
2
+ import torch
3
+ import torch.nn as nn
4
+ from torch import optim
5
+ from trainers import TrainerClassifier
6
+ from utils import get_optim_groups
7
+
8
+ # Data
9
+ import pandas as pd
10
+ import numpy as np
11
+
12
+ # Standard library
13
+ import args
14
+ import os
15
+
16
+
17
def main(config):
    """Fine-tune SMI-TED on a single-target classification dataset.

    Loads ``train.csv``/``valid.csv``/``test.csv`` from ``config.data_root``,
    loads the requested SMI-TED checkpoint, then trains and evaluates it with
    ``TrainerClassifier``.

    Args:
        config: Namespace of parsed options. This function reads
            ``smi_ted_version``, ``loss_fn``, ``data_root``, ``model_path``,
            ``ckpt_filename``, ``n_output``, ``lr_start``, ``lr_multiplier``,
            ``train_decoder``, ``dataset_name``, ``measure_name``,
            ``n_batch``, ``target_metric``, ``start_seed``,
            ``checkpoints_folder``, ``restart_filename``,
            ``save_every_epoch``, ``save_ckpt`` and ``max_epochs``; the whole
            namespace is also forwarded to the trainer as ``hparams``.

    Raises:
        ValueError: If ``config.smi_ted_version`` is not ``'v1'``/``'v2'`` or
            ``config.loss_fn`` is not ``'crossentropy'``.
    """
    # Validate option values before any I/O. Previously an unsupported value
    # left `load_smi_ted` / `loss_function` unbound and the script crashed
    # with an UnboundLocalError only after the data and model had been loaded.
    if config.smi_ted_version not in ('v1', 'v2'):
        raise ValueError(
            f"Unsupported smi_ted_version {config.smi_ted_version!r}; expected 'v1' or 'v2'."
        )
    if config.loss_fn != 'crossentropy':
        raise ValueError(
            f"Unsupported loss_fn {config.loss_fn!r} for classification; expected 'crossentropy'."
        )

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # load dataset
    df_train = pd.read_csv(f"{config.data_root}/train.csv")
    df_valid = pd.read_csv(f"{config.data_root}/valid.csv")
    df_test = pd.read_csv(f"{config.data_root}/test.csv")

    # load model (the import is deferred so only the selected variant is loaded)
    if config.smi_ted_version == 'v1':
        from smi_ted_light.load import load_smi_ted
    elif config.smi_ted_version == 'v2':
        from smi_ted_large.load import load_smi_ted

    model = load_smi_ted(
        folder=config.model_path,
        ckpt_filename=config.ckpt_filename,
        n_output=config.n_output,
        eval=False,
    )
    # Run the model's own initialiser over model.net before fine-tuning.
    model.net.apply(model._init_weights)
    print(model.net)

    lr = config.lr_start*config.lr_multiplier
    optim_groups = get_optim_groups(model, keep_decoder=bool(config.train_decoder))
    loss_function = nn.CrossEntropyLoss()

    # init trainer
    trainer = TrainerClassifier(
        raw_data=(df_train, df_valid, df_test),
        dataset_name=config.dataset_name,
        target=config.measure_name,
        batch_size=config.n_batch,
        hparams=config,
        target_metric=config.target_metric,
        seed=config.start_seed,
        smi_ted_version=config.smi_ted_version,
        checkpoints_folder=config.checkpoints_folder,
        restart_filename=config.restart_filename,
        device=device,
        save_every_epoch=bool(config.save_every_epoch),
        save_ckpt=bool(config.save_ckpt)
    )
    trainer.compile(
        model=model,
        optimizer=optim.AdamW(optim_groups, lr=lr, betas=(0.9, 0.99)),
        loss_fn=loss_function
    )
    trainer.fit(max_epochs=config.max_epochs)
    trainer.evaluate()
63
+
64
+
65
if __name__ == '__main__':
    # Script entry point: parse the command line with the shared parser from
    # args.py and hand the resulting namespace to main().
    main(args.get_parser().parse_args())
models/smi_ted/finetune/finetune_classification_multitask.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Deep learning
2
+ import torch
3
+ import torch.nn as nn
4
+ from torch import optim
5
+ from trainers import TrainerClassifierMultitask
6
+ from utils import get_optim_groups
7
+
8
+ # Data
9
+ import pandas as pd
10
+ import numpy as np
11
+
12
+ # Standard library
13
+ import args
14
+ import os
15
+
16
+
17
def main(config):
    """Fine-tune SMI-TED on a multitask classification dataset.

    Picks the target columns for ``config.dataset_name``, stores their count
    in ``config.n_output``, loads ``train.csv``/``valid.csv``/``test.csv``
    from ``config.data_root``, loads the requested SMI-TED checkpoint, then
    trains and evaluates it with ``TrainerClassifierMultitask``.

    Args:
        config: Namespace of parsed options. This function reads
            ``dataset_name``, ``smi_ted_version``, ``loss_fn``, ``data_root``,
            ``model_path``, ``ckpt_filename``, ``lr_start``,
            ``lr_multiplier``, ``train_decoder``, ``n_batch``,
            ``target_metric``, ``start_seed``, ``checkpoints_folder``,
            ``restart_filename``, ``save_every_epoch``, ``save_ckpt`` and
            ``max_epochs``. It writes ``n_output`` and forwards the whole
            namespace to the trainer as ``hparams``.

    Raises:
        ValueError: If ``config.dataset_name`` is not one of ``'tox21'``,
            ``'clintox'``, ``'sider'``, ``'muv'``; if
            ``config.smi_ted_version`` is not ``'v1'``/``'v2'``; or if
            ``config.loss_fn`` is not ``'bceloss'``.
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Define Target and Causal Features
    if config.dataset_name == 'tox21':
        targets = ['NR-AR', 'NR-AR-LBD', 'NR-AhR', 'NR-Aromatase', 'NR-ER', 'NR-ER-LBD',
                   'NR-PPAR-gamma', 'SR-ARE', 'SR-ATAD5', 'SR-HSE', 'SR-MMP', 'SR-p53']
    elif config.dataset_name == 'clintox':
        targets = ['FDA_APPROVED', 'CT_TOX']
    elif config.dataset_name == 'sider':
        targets = [
            'Hepatobiliary disorders', 'Metabolism and nutrition disorders',
            'Product issues', 'Eye disorders', 'Investigations',
            'Musculoskeletal and connective tissue disorders',
            'Gastrointestinal disorders', 'Social circumstances',
            'Immune system disorders', 'Reproductive system and breast disorders',
            'Neoplasms benign, malignant and unspecified (incl cysts and polyps)',
            'General disorders and administration site conditions',
            'Endocrine disorders', 'Surgical and medical procedures',
            'Vascular disorders', 'Blood and lymphatic system disorders',
            'Skin and subcutaneous tissue disorders',
            'Congenital, familial and genetic disorders', 'Infections and infestations',
            'Respiratory, thoracic and mediastinal disorders', 'Psychiatric disorders',
            'Renal and urinary disorders',
            'Pregnancy, puerperium and perinatal conditions',
            'Ear and labyrinth disorders', 'Cardiac disorders',
            'Nervous system disorders', 'Injury, poisoning and procedural complications'
        ]
    elif config.dataset_name == 'muv':
        targets = [
            'MUV-466', 'MUV-548', 'MUV-600', 'MUV-644', 'MUV-652', 'MUV-689',
            'MUV-692', 'MUV-712', 'MUV-713', 'MUV-733', 'MUV-737', 'MUV-810',
            'MUV-832', 'MUV-846', 'MUV-852', 'MUV-858', 'MUV-859'
        ]
    else:
        # Previously an unknown dataset left `targets` unbound and the next
        # line failed with an UnboundLocalError.
        raise ValueError(
            f"Unsupported dataset_name {config.dataset_name!r}; "
            "expected one of 'tox21', 'clintox', 'sider', 'muv'."
        )

    # Fail fast on the remaining option values, before any data or model is
    # loaded; otherwise `load_smi_ted` / `loss_function` would be unbound.
    if config.smi_ted_version not in ('v1', 'v2'):
        raise ValueError(
            f"Unsupported smi_ted_version {config.smi_ted_version!r}; expected 'v1' or 'v2'."
        )
    if config.loss_fn != 'bceloss':
        raise ValueError(
            f"Unsupported loss_fn {config.loss_fn!r} for multitask classification; expected 'bceloss'."
        )

    # One model output per target column.
    config.n_output = len(targets)

    # load dataset
    df_train = pd.read_csv(f"{config.data_root}/train.csv")
    df_valid = pd.read_csv(f"{config.data_root}/valid.csv")
    df_test = pd.read_csv(f"{config.data_root}/test.csv")

    # load model (the import is deferred so only the selected variant is loaded)
    if config.smi_ted_version == 'v1':
        from smi_ted_light.load import load_smi_ted
    elif config.smi_ted_version == 'v2':
        from smi_ted_large.load import load_smi_ted

    model = load_smi_ted(
        folder=config.model_path,
        ckpt_filename=config.ckpt_filename,
        n_output=len(targets),
        eval=False,
    )
    # Run the model's own initialiser over model.net before fine-tuning.
    model.net.apply(model._init_weights)
    print(model.net)

    lr = config.lr_start*config.lr_multiplier
    optim_groups = get_optim_groups(model, keep_decoder=bool(config.train_decoder))
    loss_function = nn.BCELoss()

    # init trainer
    trainer = TrainerClassifierMultitask(
        raw_data=(df_train, df_valid, df_test),
        dataset_name=config.dataset_name,
        target=targets,
        batch_size=config.n_batch,
        hparams=config,
        target_metric=config.target_metric,
        seed=config.start_seed,
        smi_ted_version=config.smi_ted_version,
        checkpoints_folder=config.checkpoints_folder,
        restart_filename=config.restart_filename,
        device=device,
        save_every_epoch=bool(config.save_every_epoch),
        save_ckpt=bool(config.save_ckpt)
    )
    trainer.compile(
        model=model,
        optimizer=optim.AdamW(optim_groups, lr=lr, betas=(0.9, 0.99)),
        loss_fn=loss_function
    )
    trainer.fit(max_epochs=config.max_epochs)
    trainer.evaluate()
96
+
97
+
98
if __name__ == '__main__':
    # Script entry point: parse the command line with the shared parser from
    # args.py and hand the resulting namespace to main().
    main(args.get_parser().parse_args())
models/smi_ted/finetune/finetune_regression.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Deep learning
2
+ import torch
3
+ import torch.nn as nn
4
+ from torch import optim
5
+ from trainers import TrainerRegressor
6
+ from utils import RMSELoss, get_optim_groups
7
+
8
+ # Data
9
+ import pandas as pd
10
+ import numpy as np
11
+
12
+ # Standard library
13
+ import args
14
+ import os
15
+
16
+
17
def main(config):
    """Fine-tune SMI-TED on a single-target regression dataset.

    Loads ``train.csv``/``valid.csv``/``test.csv`` from ``config.data_root``,
    loads the requested SMI-TED checkpoint, then trains and evaluates it with
    ``TrainerRegressor``.

    Args:
        config: Namespace of parsed options. This function reads
            ``smi_ted_version``, ``loss_fn``, ``data_root``, ``model_path``,
            ``ckpt_filename``, ``n_output``, ``lr_start``, ``lr_multiplier``,
            ``train_decoder``, ``dataset_name``, ``measure_name``,
            ``n_batch``, ``target_metric``, ``start_seed``,
            ``checkpoints_folder``, ``restart_filename``,
            ``save_every_epoch``, ``save_ckpt`` and ``max_epochs``; the whole
            namespace is also forwarded to the trainer as ``hparams``.

    Raises:
        ValueError: If ``config.smi_ted_version`` is not ``'v1'``/``'v2'`` or
            ``config.loss_fn`` is not ``'rmse'``/``'mae'``.
    """
    # Validate option values before any I/O. Previously an unsupported value
    # left `load_smi_ted` / `loss_function` unbound and the script crashed
    # with an UnboundLocalError only after the data and model had been loaded.
    if config.smi_ted_version not in ('v1', 'v2'):
        raise ValueError(
            f"Unsupported smi_ted_version {config.smi_ted_version!r}; expected 'v1' or 'v2'."
        )
    if config.loss_fn not in ('rmse', 'mae'):
        raise ValueError(
            f"Unsupported loss_fn {config.loss_fn!r} for regression; expected 'rmse' or 'mae'."
        )

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # load dataset
    df_train = pd.read_csv(f"{config.data_root}/train.csv")
    df_valid = pd.read_csv(f"{config.data_root}/valid.csv")
    df_test = pd.read_csv(f"{config.data_root}/test.csv")

    # load model (the import is deferred so only the selected variant is loaded)
    if config.smi_ted_version == 'v1':
        from smi_ted_light.load import load_smi_ted
    elif config.smi_ted_version == 'v2':
        from smi_ted_large.load import load_smi_ted

    model = load_smi_ted(
        folder=config.model_path,
        ckpt_filename=config.ckpt_filename,
        n_output=config.n_output,
        eval=False,
    )
    # Run the model's own initialiser over model.net before fine-tuning.
    model.net.apply(model._init_weights)
    print(model.net)

    lr = config.lr_start*config.lr_multiplier
    optim_groups = get_optim_groups(model, keep_decoder=bool(config.train_decoder))
    if config.loss_fn == 'rmse':
        loss_function = RMSELoss()
    else:  # 'mae', guaranteed by the validation above
        loss_function = nn.L1Loss()

    # init trainer
    trainer = TrainerRegressor(
        raw_data=(df_train, df_valid, df_test),
        dataset_name=config.dataset_name,
        target=config.measure_name,
        batch_size=config.n_batch,
        hparams=config,
        target_metric=config.target_metric,
        seed=config.start_seed,
        smi_ted_version=config.smi_ted_version,
        checkpoints_folder=config.checkpoints_folder,
        restart_filename=config.restart_filename,
        device=device,
        save_every_epoch=bool(config.save_every_epoch),
        save_ckpt=bool(config.save_ckpt)
    )
    trainer.compile(
        model=model,
        optimizer=optim.AdamW(optim_groups, lr=lr, betas=(0.9, 0.99)),
        loss_fn=loss_function
    )
    trainer.fit(max_epochs=config.max_epochs)
    trainer.evaluate()
65
+
66
+
67
if __name__ == '__main__':
    # Script entry point: parse the command line with the shared parser from
    # args.py and hand the resulting namespace to main().
    main(args.get_parser().parse_args())
models/smi_ted/finetune/moleculenet/bace/test.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e3af97c680375dd09349c63b4779b35166212302e79e4fc7a1752ef5d71cf35b
3
+ size 400436
models/smi_ted/finetune/moleculenet/bace/train.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b5b3426e84dc7e2f40f2cf9d15d4d38328126c07f49c215cfb4fb657f69200de
3
+ size 3109699
models/smi_ted/finetune/moleculenet/bace/valid.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:813c8f2af5a1058568cf60b7021b8b2cd818a17944afd0b09f9d838e36ee985d
3
+ size 397085
models/smi_ted/finetune/moleculenet/bbbp/test.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cca4161c44535fd0f8ff917cc68d26703da7fbce19ddecb7dc5f7ae4b4d241a6
3
+ size 14874
models/smi_ted/finetune/moleculenet/bbbp/train.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7300807bf21ea1177efd81c218e43275ed00b6c3006b5dae7625f774edb6b1a6
3
+ size 115549
models/smi_ted/finetune/moleculenet/bbbp/valid.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:af39cc3735a356010a072e1e196a64eca6e0d88f0b2a023d4dc1adba7030ce40
3
+ size 15655
models/smi_ted/finetune/moleculenet/biodegradability/biodeg_example.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c98992c1c22ae7468a41fb7bc86c775ccc30fa29e50053bb148ffc2f2d95551e
3
+ size 6352
models/smi_ted/finetune/moleculenet/biodegradability/biodegradability.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8ec61887444a0e8925b16cca48433c3b3bff1ac5cf08f448d6b64bbdbc14a318
3
+ size 416181
models/smi_ted/finetune/moleculenet/biodegradability/test.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:86c2f7f39add0fff77358454c0f1b289a233e4a78d50b7f005ec2dc1c632d473
3
+ size 84488
models/smi_ted/finetune/moleculenet/biodegradability/train.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1a4a94ae0f8c134ce10f2d853eced84d031a4e7b394662344a9141e7567b3eb2
3
+ size 252230
models/smi_ted/finetune/moleculenet/biodegradability/valid.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09e827ee7e55544f5b327d5e2ef2d9fe09e3f62024e1316b6e71d1fc9be275a1
3
+ size 85290
models/smi_ted/finetune/moleculenet/clintox/test.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:963a05e8eeaaa38fd3688f448dfc28cd0917ea280b1b9cb5b4297244f7f68fe2
3
+ size 10219
models/smi_ted/finetune/moleculenet/clintox/train.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:04bbee4a0d7fb4942292c9581f318909d06508d529a4a3a76590e6749417c1a7
3
+ size 74357
models/smi_ted/finetune/moleculenet/clintox/valid.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f3e2b9ab566ffc184c0590002bfbd6a42e6522209e6d6271968262844dde2905
3
+ size 10255
models/smi_ted/finetune/moleculenet/esol/test.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7da41a7eab447fdfd163292b4a5eb8ef09a747fc82b0f1cc5c468e46b1b2ef5a
3
+ size 9999
models/smi_ted/finetune/moleculenet/esol/train.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:784ba31de05a43ecab98260c94a47e2c807f4d65c0f93d9a88fbd962515976c5
3
+ size 77154
models/smi_ted/finetune/moleculenet/esol/valid.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bc30e7fa1f774e27ed56de7cfd77e21f07a5a2c38fcc6d928c0084a9a99181e5
3
+ size 9892
models/smi_ted/finetune/moleculenet/freesolv/test.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c8212c391ccbff3722a11d1bd3752b3a9dd187f2a7b33f8b9d2d594950b188d7
3
+ size 3223
models/smi_ted/finetune/moleculenet/freesolv/train.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f3b781e5d03dbd7d272347288161f92e8e66c628da50e3e2bc06de12225de22d
3
+ size 25053
models/smi_ted/finetune/moleculenet/freesolv/valid.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3b35d9c13a02291eefe85bd4b048ccc28f5326a3b018beb937aba12067b072d2
3
+ size 3151
models/smi_ted/finetune/moleculenet/hiv/test.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6e86ca708a331966f6e7b06621a2e221a9f6ce45f0141e6cbe919fd64ec50fc7
3
+ size 213176
models/smi_ted/finetune/moleculenet/hiv/train.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c289700d093d7ccbe55a583ad5cb3a670df931a19283ea66880413ed398358ff
3
+ size 1685863