Repository: MarcusOlivecrona/REINVENT Branch: master Commit: 752935c29d46 Files: 20 Total size: 103.0 MB Directory structure: gitextract_jbrxke6h/ ├── LICENSE ├── README.md ├── Vizard/ │ ├── main.py │ ├── run.sh │ ├── templates/ │ │ ├── index.html │ │ └── styles.css │ └── theme.yaml ├── data/ │ ├── ChEMBL_filtered │ ├── Prior.ckpt │ ├── Voc │ └── clf.pkl ├── data_structs.py ├── main.py ├── model.py ├── multiprocess.py ├── scoring_functions.py ├── train_agent.py ├── train_prior.py ├── utils.py └── vizard_logger.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: LICENSE ================================================ Copyright <2017> Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
================================================ FILE: README.md ================================================ # REINVENT ## Molecular De Novo design using Recurrent Neural Networks and Reinforcement Learning Searching chemical space as described in: [Molecular De Novo Design through Deep Reinforcement Learning](https://arxiv.org/abs/1704.07555) ![Video demonstrating an Agent trained to generate analogues to Celecoxib](https://github.com/MarcusOlivecrona/REINVENT/blob/master/images/celecoxib_analogues.gif "Training an Agent to generate analogues of Celecoxib") ## Notes The current version is a PyTorch implementation that differs in several ways from the original implementation described in the paper. This version works better in most situations and is better documented, but for the purpose of reproducing results from the paper refer to [Release v1.0.1](https://github.com/MarcusOlivecrona/REINVENT/releases/tag/v1.0.1) Differences from the implementation in the paper: * Written in PyTorch/Python3.6 rather than TF/Python2.7 * SMILES are encoded with the token index rather than as a one-hot of the index. An embedding matrix is then used to transform the token index to a feature vector. * Scores are in the range (0,1). * A regularizer that penalizes high values of total episodic likelihood is included. * Sequences are only considered once, i.e. if the same sequence is generated twice in a batch only the first instance contributes to the loss. * These changes make the algorithm more robust towards local minima, meaning much higher values of sigma can be used if needed. ## Requirements This package requires: * Python 3.6 * PyTorch 0.1.12 * [RDkit](http://www.rdkit.org/docs/Install.html) * Scikit-Learn (for QSAR scoring function) * tqdm (for training Prior) * pexpect ## Usage To train a Prior starting with a SMILES file called mols.smi: * First filter the SMILES and construct a vocabulary from the remaining sequences.
`./data_structs.py mols.smi` - Will generate data/mols_filtered.smi and data/Voc. A filtered file containing around 1.1 million SMILES and the corresponding Voc is contained in "data". * Then use `./train_prior.py` to train the Prior. A pretrained Prior is included. To train an Agent using our Prior, use the main.py script. For example: * `./main.py --scoring-function activity_model --num-steps 1000` Training can be visualized using the Vizard bokeh app. The vizard_logger.py is used to log information (by default to data/logs) such as structures generated, average score, and network weights. * `cd Vizard` * `./run.sh ../data/logs` * Open the browser at http://localhost:5006/Vizard ================================================ FILE: Vizard/main.py ================================================ from bokeh.plotting import figure, ColumnDataSource, curdoc from bokeh.models import CustomJS, Range1d from bokeh.models.glyphs import Text from bokeh.layouts import row, column, widgetbox, layout from bokeh.models.widgets import Div import bokeh.palettes from rdkit import Chem from rdkit.Chem import Draw from rdkit import rdBase import sys import os.path import numpy as np import math """Bokeh app that visualizes training progress for the De Novo design reinforcement learning. The app is updated dynamically using information that the train_agent.py script writes to a logging directory.""" rdBase.DisableLog('rdApp.error') error_msg = """Need to provide valid log directory as first argument. 'bokeh serve . 
--args [log_dir]'""" try: path = sys.argv[1] except IndexError: raise IndexError(error_msg) if not os.path.isdir(path): raise ValueError(error_msg) score_source = ColumnDataSource(data=dict(x=[], y=[], y_mean=[])) score_fig = figure(title="Scores", plot_width=600, plot_height=600) score_fig.line('x', 'y', legend='Average score', source=score_source) score_fig.line('x', 'y_mean', legend='Running average of average score', line_width=2, color="firebrick", source=score_source) score_fig.xaxis.axis_label = "Step" score_fig.yaxis.axis_label = "Average Score" score_fig.title.text_font_size = "20pt" score_fig.legend.location = "bottom_right" score_fig.css_classes = ["score_fig"] img_fig = Div(text="", width=850, height=590) img_fig.css_classes = ["img_outside"] def downsample(data, max_len): np.random.seed(0) if len(data)>max_len: data = np.random.choice(data, size=max_len, replace=False) return data def running_average(data, length): early_cumsum = np.cumsum(data[:length]) / np.arange(1, min(len(data), length) + 1) if len(data)>length: cumsum = np.cumsum(data) cumsum = (cumsum[length:] - cumsum[:-length]) / length cumsum = np.concatenate((early_cumsum, cumsum)) return cumsum return early_cumsum def create_bar_plot(init_data, title): init_data = downsample(init_data, 50) x = range(len(init_data)) source = ColumnDataSource(data=dict(x= [], y=[])) fig = figure(title=title, plot_width=300, plot_height=300) fig.vbar(x=x, width=1, top=init_data, fill_alpha=0.05) fig.vbar('x', width=1, top='y', fill_alpha=0.3, source=source) fig.y_range = Range1d(min(0, 1.2 * min(init_data)), 1.2 * max(init_data)) return fig, source def create_hist_plot(init_data, title): source = ColumnDataSource(data=dict(hist=[], left_edge=[], right_edge=[])) init_hist, init_edge = np.histogram(init_data, density=True, bins=50) fig = figure(title=title, plot_width=300, plot_height=300) fig.quad(top=init_hist, bottom=0, left=init_edge[:-1], right=init_edge[1:], fill_alpha=0.05) fig.quad(top='hist', bottom=0, 
left='left_edge', right='right_edge', fill_alpha=0.3, source=source) return fig, source weights = [f for f in os.listdir(path) if f.startswith("weight")] weights = {w:{'init_weight': np.load(os.path.join(path, "init_" + w)).reshape(-1)} for w in weights} for name, w in weights.items(): w['bar_fig'], w['bar_source'] = create_bar_plot(w['init_weight'], name) w['hist_fig'], w['hist_source'] = create_hist_plot(w['init_weight'], name + "_histogram") bar_plots = [w['bar_fig'] for name, w in weights.items()] hist_plots = [w['hist_fig'] for name, w in weights.items()] layout = layout([[img_fig, score_fig], bar_plots, hist_plots], sizing_mode="fixed") curdoc().add_root(layout) def update(): score = np.load(os.path.join(path, "Scores.npy")) with open(os.path.join(path, "SMILES"), "r") as f: mols = [] scores = [] for line in f: line = line.split() mol = Chem.MolFromSmiles(line[0]) if mol and len(mols)<6: mols.append(mol) scores.append(line[1]) img = Draw.MolsToGridImage(mols, molsPerRow=3, legends=scores, subImgSize=(250,250), useSVG=True) img = img.replace("FFFFFF", "EDEDED") img_fig.text = '

Generated Molecules

' + '
' + img + '
' score_source.data = dict(x=score[0], y=score[1], y_mean=running_average(score[1], 50)) for name, w in weights.items(): current_weights = np.load(os.path.join(path, name)).reshape(-1) hist, edge = np.histogram(current_weights, density=True, bins=50) w['hist_source'].data = dict(hist=hist, left_edge=edge[:-1], right_edge=edge[1:]) current_weights = downsample(current_weights, 50) w['bar_source'].data = dict(x=range(len(current_weights)), y=current_weights) update() curdoc().add_periodic_callback(update, 1000) ================================================ FILE: Vizard/run.sh ================================================ #!/bin/bash if [ -z "$1" ]; then echo "Must supply path to a directory where vizard_logger is saving its information"; exit 0 fi bokeh serve . --args $1 ================================================ FILE: Vizard/templates/index.html ================================================ {{ bokeh_css }} {{ bokeh_js }} MolExplorer

Vizard

{{ plot_div|indent(8) }}
{{ plot_script|indent(8) }} ================================================ FILE: Vizard/templates/styles.css ================================================ html { background-color: #2F2F2F; display: table; margin: auto; } body { display: table-cell; vertical-align: middle; color: #fff; } .img_outside { position: relative; } .img_inside { background-color: #EDEDED; border: 7px solid #656565; position:absolute; left:50% ; margin-left: -375px; top:50% ; margin-top: -250px; } .score_fig{ position: absolute; top: 10px; } h1 { margin: 0.5em 0 0.5em 0; color: #fff; font-family: 'Julius Sans One', sans-serif; font-size: 3em; text-transform: uppercase; text-align: center; } h2 { margin: 0 0 0 0; color: #fff; font-size: 20pt; text-align: center; } a:link { font-weight: bold; text-decoration: none; color: #0d8ba1; } a:visited { font-weight: bold; text-decoration: none; color: #1a5952; } a:hover, a:focus, a:active { text-decoration: underline; color: #9685BA; } ================================================ FILE: Vizard/theme.yaml ================================================ attrs: Figure: background_fill_color: '#2F2F2F' border_fill_color: '#2F2F2F' outline_line_color: '#444444' min_border_top: 0 Axis: axis_line_color: "#FFFFFF" axis_label_text_color: "#FFFFFF" axis_label_text_font_size: "10pt" axis_label_text_font_style: "normal" axis_label_standoff: 10 major_label_text_color: "#FFFFFF" major_tick_line_color: "#FFFFFF" minor_tick_line_color: "#FFFFFF" minor_tick_line_color: "#FFFFFF" Grid: grid_line_dash: [6, 4] grid_line_alpha: .3 Title: text_color: "#FFFFFF" align: "center" ================================================ FILE: data/ChEMBL_filtered ================================================ [File too large to display: 53.3 MB] ================================================ FILE: data/Prior.ckpt ================================================ [File too large to display: 15.9 MB] ================================================ FILE: data/Voc 
================================================ [S-] 9 ( S c [NH+] 3 [CH] o [NH3+] [nH] 7 6 [N] 1 O % [N-] 5 - [O+] [n+] [o+] [nH+] [NH2+] [N+] [O-] [S+] R F [n-] [s+] L s 8 4 [SH] 2 = n ) [O] N # [NH-] C [SH+] 0 ================================================ FILE: data/clf.pkl ================================================ [File too large to display: 33.8 MB] ================================================ FILE: data_structs.py ================================================ import numpy as np import random import re import pickle from rdkit import Chem import sys import time import torch from torch.utils.data import Dataset from utils import Variable class Vocabulary(object): """A class for handling encoding/decoding from SMILES to an array of indices""" def __init__(self, init_from_file=None, max_length=140): self.special_tokens = ['EOS', 'GO'] self.additional_chars = set() self.chars = self.special_tokens self.vocab_size = len(self.chars) self.vocab = dict(zip(self.chars, range(len(self.chars)))) self.reversed_vocab = {v: k for k, v in self.vocab.items()} self.max_length = max_length if init_from_file: self.init_from_file(init_from_file) def encode(self, char_list): """Takes a list of characters (eg '[NH]') and encodes to array of indices""" smiles_matrix = np.zeros(len(char_list), dtype=np.float32) for i, char in enumerate(char_list): smiles_matrix[i] = self.vocab[char] return smiles_matrix def decode(self, matrix): """Takes an array of indices and returns the corresponding SMILES""" chars = [] for i in matrix: if i == self.vocab['EOS']: break chars.append(self.reversed_vocab[i]) smiles = "".join(chars) smiles = smiles.replace("L", "Cl").replace("R", "Br") return smiles def tokenize(self, smiles): """Takes a SMILES and return a list of characters/tokens""" regex = '(\[[^\[\]]{1,6}\])' smiles = replace_halogen(smiles) char_list = re.split(regex, smiles) tokenized = [] for char in char_list: if char.startswith('['): tokenized.append(char) else: chars = 
[unit for unit in char] [tokenized.append(unit) for unit in chars] tokenized.append('EOS') return tokenized def add_characters(self, chars): """Adds characters to the vocabulary""" for char in chars: self.additional_chars.add(char) char_list = list(self.additional_chars) char_list.sort() self.chars = char_list + self.special_tokens self.vocab_size = len(self.chars) self.vocab = dict(zip(self.chars, range(len(self.chars)))) self.reversed_vocab = {v: k for k, v in self.vocab.items()} def init_from_file(self, file): """Takes a file containing \n separated characters to initialize the vocabulary""" with open(file, 'r') as f: chars = f.read().split() self.add_characters(chars) def __len__(self): return len(self.chars) def __str__(self): return "Vocabulary containing {} tokens: {}".format(len(self), self.chars) class MolData(Dataset): """Custom PyTorch Dataset that takes a file containing SMILES. Args: fname : path to a file containing \n separated SMILES. voc : a Vocabulary instance Returns: A custom PyTorch dataset for training the Prior. 
""" def __init__(self, fname, voc): self.voc = voc self.smiles = [] with open(fname, 'r') as f: for line in f: self.smiles.append(line.split()[0]) def __getitem__(self, i): mol = self.smiles[i] tokenized = self.voc.tokenize(mol) encoded = self.voc.encode(tokenized) return Variable(encoded) def __len__(self): return len(self.smiles) def __str__(self): return "Dataset containing {} structures.".format(len(self)) @classmethod def collate_fn(cls, arr): """Function to take a list of encoded sequences and turn them into a batch""" max_length = max([seq.size(0) for seq in arr]) collated_arr = Variable(torch.zeros(len(arr), max_length)) for i, seq in enumerate(arr): collated_arr[i, :seq.size(0)] = seq return collated_arr class Experience(object): """Class for prioritized experience replay that remembers the highest scored sequences seen and samples from them with probabilities relative to their scores.""" def __init__(self, voc, max_size=100): self.memory = [] self.max_size = max_size self.voc = voc def add_experience(self, experience): """Experience should be a list of (smiles, score, prior likelihood) tuples""" self.memory.extend(experience) if len(self.memory)>self.max_size: # Remove duplicates idxs, smiles = [], [] for i, exp in enumerate(self.memory): if exp[0] not in smiles: idxs.append(i) smiles.append(exp[0]) self.memory = [self.memory[idx] for idx in idxs] # Retain highest scores self.memory.sort(key = lambda x: x[1], reverse=True) self.memory = self.memory[:self.max_size] print("\nBest score in memory: {:.2f}".format(self.memory[0][1])) def sample(self, n): """Sample a batch size n of experience""" if len(self.memory)4: exp_seqs, exp_score, exp_prior_likelihood = experience.sample(4) exp_agent_likelihood, exp_entropy = Agent.likelihood(exp_seqs.long()) exp_augmented_likelihood = exp_prior_likelihood + sigma * exp_score exp_loss = torch.pow((Variable(exp_augmented_likelihood) - exp_agent_likelihood), 2) loss = torch.cat((loss, exp_loss), 0) agent_likelihood = 
torch.cat((agent_likelihood, exp_agent_likelihood), 0) # Then add new experience prior_likelihood = prior_likelihood.data.cpu().numpy() new_experience = zip(smiles, score, prior_likelihood) experience.add_experience(new_experience) # Calculate loss loss = loss.mean() # Add regularizer that penalizes high likelihood for the entire sequence loss_p = - (1 / agent_likelihood).mean() loss += 5 * 1e3 * loss_p # Calculate gradients and make an update to the network weights optimizer.zero_grad() loss.backward() optimizer.step() # Convert to numpy arrays so that we can print them augmented_likelihood = augmented_likelihood.data.cpu().numpy() agent_likelihood = agent_likelihood.data.cpu().numpy() # Print some information for this step time_elapsed = (time.time() - start_time) / 3600 time_left = (time_elapsed * ((n_steps - step) / (step + 1))) print("\n Step {} Fraction valid SMILES: {:4.1f} Time elapsed: {:.2f}h Time left: {:.2f}h".format( step, fraction_valid_smiles(smiles) * 100, time_elapsed, time_left)) print(" Agent Prior Target Score SMILES") for i in range(10): print(" {:6.2f} {:6.2f} {:6.2f} {:6.2f} {}".format(agent_likelihood[i], prior_likelihood[i], augmented_likelihood[i], score[i], smiles[i])) # Need this for Vizard plotting step_score[0].append(step + 1) step_score[1].append(np.mean(score)) # Log some weights logger.log(Agent.rnn.gru_2.weight_ih.cpu().data.numpy()[::100], "weight_GRU_layer_2_w_ih") logger.log(Agent.rnn.gru_2.weight_hh.cpu().data.numpy()[::100], "weight_GRU_layer_2_w_hh") logger.log(Agent.rnn.embedding.weight.cpu().data.numpy()[::30], "weight_GRU_embedding") logger.log(Agent.rnn.gru_2.bias_ih.cpu().data.numpy(), "weight_GRU_layer_2_b_ih") logger.log(Agent.rnn.gru_2.bias_hh.cpu().data.numpy(), "weight_GRU_layer_2_b_hh") logger.log("\n".join([smiles + "\t" + str(round(score, 2)) for smiles, score in zip \ (smiles[:12], score[:12])]), "SMILES", dtype="text", overwrite=True) logger.log(np.array(step_score), "Scores") # If the entire training 
finishes, we create a new folder where we save this python file # as well as some sampled sequences and the contents of the experinence (which are the highest # scored sequences seen during training) if not save_dir: save_dir = 'data/results/run_' + time.strftime("%Y-%m-%d-%H_%M_%S", time.localtime()) os.makedirs(save_dir) copyfile('train_agent.py', os.path.join(save_dir, "train_agent.py")) experience.print_memory(os.path.join(save_dir, "memory")) torch.save(Agent.rnn.state_dict(), os.path.join(save_dir, 'Agent.ckpt')) seqs, agent_likelihood, entropy = Agent.sample(256) prior_likelihood, _ = Prior.likelihood(Variable(seqs)) prior_likelihood = prior_likelihood.data.cpu().numpy() smiles = seq_to_smiles(seqs, voc) score = scoring_function(smiles) with open(os.path.join(save_dir, "sampled"), 'w') as f: f.write("SMILES Score PriorLogP\n") for smiles, score, prior_likelihood in zip(smiles, score, prior_likelihood): f.write("{} {:5.2f} {:6.2f}\n".format(smiles, score, prior_likelihood)) if __name__ == "__main__": train_agent() ================================================ FILE: train_prior.py ================================================ #!/usr/bin/env python import torch from torch.utils.data import DataLoader import pickle from rdkit import Chem from rdkit import rdBase from tqdm import tqdm from data_structs import MolData, Vocabulary from model import RNN from utils import Variable, decrease_learning_rate rdBase.DisableLog('rdApp.error') def pretrain(restore_from=None): """Trains the Prior RNN""" # Read vocabulary from a file voc = Vocabulary(init_from_file="data/Voc") # Create a Dataset from a SMILES file moldata = MolData("data/mols_filtered.smi", voc) data = DataLoader(moldata, batch_size=128, shuffle=True, drop_last=True, collate_fn=MolData.collate_fn) Prior = RNN(voc) # Can restore from a saved RNN if restore_from: Prior.rnn.load_state_dict(torch.load(restore_from)) optimizer = torch.optim.Adam(Prior.rnn.parameters(), lr = 0.001) for epoch in range(1, 6): # 
When training on a few million compounds, this model converges # in a few of epochs or even faster. If model sized is increased # its probably a good idea to check loss against an external set of # validation SMILES to make sure we dont overfit too much. for step, batch in tqdm(enumerate(data), total=len(data)): # Sample from DataLoader seqs = batch.long() # Calculate loss log_p, _ = Prior.likelihood(seqs) loss = - log_p.mean() # Calculate gradients and take a step optimizer.zero_grad() loss.backward() optimizer.step() # Every 500 steps we decrease learning rate and print some information if step % 500 == 0 and step != 0: decrease_learning_rate(optimizer, decrease_by=0.03) tqdm.write("*" * 50) tqdm.write("Epoch {:3d} step {:3d} loss: {:5.2f}\n".format(epoch, step, loss.data[0])) seqs, likelihood, _ = Prior.sample(128) valid = 0 for i, seq in enumerate(seqs.cpu().numpy()): smile = voc.decode(seq) if Chem.MolFromSmiles(smile): valid += 1 if i < 5: tqdm.write(smile) tqdm.write("\n{:>4.1f}% valid SMILES".format(100 * valid / len(seqs))) tqdm.write("*" * 50 + "\n") torch.save(Prior.rnn.state_dict(), "data/Prior.ckpt") # Save the Prior torch.save(Prior.rnn.state_dict(), "data/Prior.ckpt") if __name__ == "__main__": pretrain() ================================================ FILE: utils.py ================================================ import torch import numpy as np from rdkit import Chem def Variable(tensor): """Wrapper for torch.autograd.Variable that also accepts numpy arrays directly and automatically assigns it to the GPU. 
Be aware in case some operations are better left to the CPU.""" if isinstance(tensor, np.ndarray): tensor = torch.from_numpy(tensor) if torch.cuda.is_available(): return torch.autograd.Variable(tensor).cuda() return torch.autograd.Variable(tensor) def decrease_learning_rate(optimizer, decrease_by=0.01): """Multiplies the learning rate of the optimizer by 1 - decrease_by""" for param_group in optimizer.param_groups: param_group['lr'] *= (1 - decrease_by) def seq_to_smiles(seqs, voc): """Takes an output sequence from the RNN and returns the corresponding SMILES.""" smiles = [] for seq in seqs.cpu().numpy(): smiles.append(voc.decode(seq)) return smiles def fraction_valid_smiles(smiles): """Takes a list of SMILES and returns fraction valid.""" i = 0 for smile in smiles: if Chem.MolFromSmiles(smile): i += 1 return i / len(smiles) def unique(arr): # Finds unique rows in arr and return their indices arr = arr.cpu().numpy() arr_ = np.ascontiguousarray(arr).view(np.dtype((np.void, arr.dtype.itemsize * arr.shape[1]))) _, idxs = np.unique(arr_, return_index=True) if torch.cuda.is_available(): return torch.LongTensor(np.sort(idxs)).cuda() return torch.LongTensor(np.sort(idxs)) ================================================ FILE: vizard_logger.py ================================================ import numpy as np import os class VizardLog(): def __init__(self, log_dir): self.log_dir = log_dir if not os.path.exists(log_dir): os.makedirs(log_dir) # List of variables to log self.logged_vars = [] # Dict of {name_of_variable : time_since_last_logged} self.last_logged = {} # Dict of [name_of_variable : log_every} self.log_every = {} self.overwrite = {} def log(self, data, name, dtype="array", log_every=1, overwrite=False): if name not in self.logged_vars: self.logged_vars.append(name) self.last_logged[name] = 1 self.log_every[name] = log_every if overwrite: self.overwrite[name] = 'w' else: self.overwrite[name] = 'a' if self.last_logged[name] == self.log_every[name]: out_f = 
os.path.join(self.log_dir, name) if dtype=="text": with open(out_f, self.overwrite[name]) as f: f.write(data) elif dtype=="array": np.save(out_f, data) elif dtype=="hist": np.save(out_f, np.histogram(data, density=True, bins=50))