DiffLinker/generate_with_pocket.py at main · igashov/DiffLinker

History

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

288

289

290

291

292

293

294

295

296

297

298

299

300

301

302

303

304

305

306

307

308

309

310

import argparse

import os

import numpy as np

import torch

import subprocess

from rdkit import Chem

from Bio.PDB import PDBParser

from src import const

from src.datasets import (

collate_with_fragment_without_pocket_edges, get_dataloader, get_one_hot, parse_molecule, MOADDataset

)

from src.lightning import DDPM

from src.visualizer import save_xyz_file

from src.utils import FoundNaNException, set_deterministic

from tqdm import tqdm

from src.linker_size_lightning import SizeClassifier

from pdb import set_trace

# Command-line interface of the script. Every option is described once in the
# table below as (flag, add_argument keyword arguments) and registered in order,
# so the order of entries is also the order shown by --help.
_CLI_OPTIONS = (
    ('--fragments', dict(
        type=str, required=True,
        help='Path to the file with input fragments',
    )),
    ('--pocket', dict(
        type=str, required=True,
        help='Path to the file with pocket atoms',
    )),
    ('--backbone_atoms_only', dict(
        action='store_true', default=False,
        help='Flag if to use only protein backbone atoms',
    )),
    ('--model', dict(
        type=str, required=True,
        help='Path to the DiffLinker model',
    )),
    ('--linker_size', dict(
        type=str, required=True,
        help='Linker size (int) or allowed size boundaries (comma-separated integers) or path to the size prediction model',
    )),
    ('--output', dict(
        type=str, default='./',
        help='Directory where sampled molecules will be saved',
    )),
    ('--n_samples', dict(
        type=int, default=5,
        help='Number of linkers to generate',
    )),
    ('--n_steps', dict(
        type=int,
        help='Number of denoising steps',
    )),
    ('--anchors', dict(
        type=str,
        help='Comma-separated indices of anchor atoms '
             '(according to the order of atoms in the input fragments file, enumeration starts with 1)',
    )),
    ('--max_batch_size', dict(
        type=int, default=64,
        help='Max batch size',
    )),
    ('--random_seed', dict(
        type=int,
        help='Random seed',
    )),
)

parser = argparse.ArgumentParser()
for _flag, _kwargs in _CLI_OPTIONS:
    parser.add_argument(_flag, **_kwargs)

def read_molecule(path):
    """Load a molecule with RDKit, choosing the reader from the file extension.

    Supported extensions are .pdb, .mol, .mol2 and .sdf. Every reader is
    called with sanitize=False and removeHs=True. For .sdf files only the
    first record of the supplier is returned.

    :param path: path to the molecule file
    :return: whatever the selected RDKit reader returns for the file
    :raises Exception: if the path ends with none of the supported extensions
    """
    reader_flags = {'sanitize': False, 'removeHs': True}

    if path.endswith('.pdb'):
        return Chem.MolFromPDBFile(path, **reader_flags)
    if path.endswith('.mol'):
        return Chem.MolFromMolFile(path, **reader_flags)
    if path.endswith('.mol2'):
        return Chem.MolFromMol2File(path, **reader_flags)
    if path.endswith('.sdf'):
        # An SD file may hold several records; only the first one is used.
        supplier = Chem.SDMolSupplier(path, **reader_flags)
        return supplier[0]

    raise Exception('Unknown file extension')

def read_pocket(path):
    """Read pocket atoms from a PDB file.

    Walks over every atom of every residue of the parsed structure and
    collects coordinates and upper-cased element symbols twice: once for all
    atoms and once for atoms whose name is N, CA, C or O.

    :param path: path to the PDB file with the pocket
    :return: dict of numpy arrays with keys 'full_coord', 'full_types',
        'bb_coord' and 'bb_types'
    """
    backbone_names = {'N', 'CA', 'C', 'O'}
    all_xyz, all_elements = [], []
    backbone_xyz, backbone_elements = [], []

    structure = PDBParser().get_structure('', path)
    for residue in structure.get_residues():
        for atom in residue.get_atoms():
            element = atom.element.upper()
            xyz = atom.get_coord().tolist()

            # No atom is filtered out of the full arrays (hydrogens included);
            # only the backbone arrays are restricted, by atom name.
            all_xyz.append(xyz)
            all_elements.append(element)
            if atom.get_name() in backbone_names:
                backbone_xyz.append(xyz)
                backbone_elements.append(element)

    return {
        'full_coord': np.array(all_xyz),
        'full_types': np.array(all_elements),
        'bb_coord': np.array(backbone_xyz),
        'bb_types': np.array(backbone_elements),
    }

def main(input_path, pocket_path, backbone_atoms_only, model,
         output_dir, n_samples, n_steps, linker_size, anchors, max_batch_size, random_seed):
    """Sample linkers for the input fragments in a protein pocket and save them.

    The fragments and the pocket are read from disk, merged into one point
    cloud, replicated ``n_samples`` times and passed to the DiffLinker model.
    Each sample is written to ``output_dir`` as an .xyz file and then
    converted to .sdf with the external ``obabel`` program.

    Invalid inputs (unsupported extensions, unreadable files, missing or
    out-of-range anchors) are reported with ``print`` and the function returns
    without sampling.

    :param input_path: path to the fragments file (.sdf, .pdb, .mol or .mol2)
    :param pocket_path: path to the pocket file (.pdb)
    :param backbone_atoms_only: if true, use the 'bb_*' arrays returned by
        read_pocket instead of the 'full_*' ones
    :param model: path to the DDPM checkpoint
    :param output_dir: output directory, created if it does not exist
    :param n_samples: number of samples to generate
    :param n_steps: if not None, overrides ``ddpm.edm.T``
    :param linker_size: string that is either a non-negative integer (fixed
        size), two comma-separated integers (inclusive bounds for uniform
        sampling), or otherwise a path to a SizeClassifier checkpoint
    :param anchors: comma-separated 1-based indices of anchor atoms among the
        fragment atoms, or None
    :param max_batch_size: upper bound for the batch size
    :param random_seed: if not None, passed to set_deterministic
    :raises Exception: if sampling a batch hits FoundNaNException 5 times in a row
    """
    # Setup
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    os.makedirs(output_dir, exist_ok=True)

    if random_seed is not None:
        set_deterministic(random_seed)

    # Choose how the number of linker atoms is picked for every sample
    if linker_size.isdigit():
        print(f'Will generate linkers with {linker_size} atoms')
        linker_size = int(linker_size)

        def sample_fn(_data):
            return torch.ones(_data['positions'].shape[0], device=device, dtype=const.TORCH_INT) * linker_size

    else:
        boundaries = [x.strip() for x in linker_size.split(',')]
        if len(boundaries) == 2 and boundaries[0].isdigit() and boundaries[1].isdigit():
            left = int(boundaries[0])
            right = int(boundaries[1])
            print(f'Will generate linkers with numbers of atoms sampled from U({left}, {right})')

            def sample_fn(_data):
                # One size per sample; torch.randint excludes the upper bound, hence right + 1
                shape = (len(_data['positions']),)
                return torch.randint(left, right + 1, shape, device=device, dtype=const.TORCH_INT)

        else:
            print(f'Will generate linkers with sampled numbers of atoms')
            size_nn = SizeClassifier.load_from_checkpoint(linker_size, map_location=device).eval().to(device)

            def sample_fn(_data):
                out, _ = size_nn.forward(_data, return_loss=False, with_pocket=True)
                probabilities = torch.softmax(out, dim=1)
                distribution = torch.distributions.Categorical(probs=probabilities)
                samples = distribution.sample()
                # Map sampled class labels to linker sizes
                sizes = [size_nn.linker_id2size[label] for label in samples.detach().cpu().numpy()]
                return torch.tensor(sizes, device=samples.device, dtype=const.TORCH_INT)

    ddpm = DDPM.load_from_checkpoint(model, map_location=device).eval().to(device)

    if n_steps is not None:
        ddpm.edm.T = n_steps

    if ddpm.center_of_mass == 'anchors' and anchors is None:
        print(
            'Please pass anchor atoms indices '
            'or use another DiffLinker model that does not require information about anchors'
        )
        return

    # Reading input fragments
    extension = input_path.split('.')[-1]
    if extension not in ['sdf', 'pdb', 'mol', 'mol2']:
        print('Please upload the fragments file in one of the following formats: .pdb, .sdf, .mol, .mol2')
        return

    pocket_extension = pocket_path.split('.')[-1]
    if pocket_extension != 'pdb':
        print('Please upload the pocket file in .pdb format')
        return

    try:
        molecule = read_molecule(input_path)
        molecule = Chem.RemoveAllHs(molecule)
        # File name without directory and extension; used in the output file names
        name = '.'.join(input_path.split('/')[-1].split('.')[:-1])
    except Exception as e:
        print(f'Could not read the file with fragments: {e}')
        return

    try:
        pocket_data = read_pocket(pocket_path)
    except Exception as e:
        print(f'Could not read the file with pocket: {e}')
        return

    # Parsing fragments data
    frag_pos, frag_one_hot, frag_charges = parse_molecule(molecule, is_geom=ddpm.is_geom)

    # Parsing pocket data
    pocket_mode = 'bb' if backbone_atoms_only else 'full'
    pocket_pos = pocket_data[f'{pocket_mode}_coord']
    pocket_one_hot = []
    pocket_charges = []
    for atom_type in pocket_data[f'{pocket_mode}_types']:
        pocket_one_hot.append(get_one_hot(atom_type, const.GEOM_ATOM2IDX))
        pocket_charges.append(const.GEOM_CHARGES[atom_type])
    pocket_one_hot = np.array(pocket_one_hot)
    pocket_charges = np.array(pocket_charges)

    # Fragment atoms come first, pocket atoms follow
    positions = np.concatenate([frag_pos, pocket_pos], axis=0)
    one_hot = np.concatenate([frag_one_hot, pocket_one_hot], axis=0)
    charges = np.concatenate([frag_charges, pocket_charges], axis=0)
    anchor_flags = np.zeros_like(charges)
    if anchors is not None:
        n_fragment_atoms = len(frag_charges)
        for anchor in anchors.split(','):
            anchor_idx = int(anchor.strip())
            # Anchors are 1-based and must refer to fragment atoms. Without this check
            # index 0 (or a negative one) would wrap around and an index beyond the
            # fragments would silently mark a pocket atom as an anchor.
            if not 1 <= anchor_idx <= n_fragment_atoms:
                print(
                    f'Anchor index {anchor_idx} is out of range: '
                    f'expected a value between 1 and {n_fragment_atoms}'
                )
                return
            anchor_flags[anchor_idx - 1] = 1

    fragment_only_mask = np.concatenate([
        np.ones_like(frag_charges),
        np.zeros_like(pocket_charges),
    ])
    pocket_mask = np.concatenate([
        np.zeros_like(frag_charges),
        np.ones_like(pocket_charges),
    ])
    linker_mask = np.concatenate([
        np.zeros_like(frag_charges),
        np.zeros_like(pocket_charges),
    ])
    # Here the "fragment" mask covers both the fragment and the pocket atoms
    fragment_mask = np.concatenate([
        np.ones_like(frag_charges),
        np.ones_like(pocket_charges),
    ])

    # The same input is repeated n_samples times
    dataset = [{
        'uuid': '0',
        'name': '0',
        'positions': torch.tensor(positions, dtype=const.TORCH_FLOAT, device=device),
        'one_hot': torch.tensor(one_hot, dtype=const.TORCH_FLOAT, device=device),
        'charges': torch.tensor(charges, dtype=const.TORCH_FLOAT, device=device),
        'anchors': torch.tensor(anchor_flags, dtype=const.TORCH_FLOAT, device=device),
        'fragment_only_mask': torch.tensor(fragment_only_mask, dtype=const.TORCH_FLOAT, device=device),
        'pocket_mask': torch.tensor(pocket_mask, dtype=const.TORCH_FLOAT, device=device),
        'fragment_mask': torch.tensor(fragment_mask, dtype=const.TORCH_FLOAT, device=device),
        'linker_mask': torch.tensor(linker_mask, dtype=const.TORCH_FLOAT, device=device),
        'num_atoms': len(positions),
    }] * n_samples
    dataset = MOADDataset(data=dataset)
    ddpm.val_dataset = dataset

    global_batch_size = min(n_samples, max_batch_size)
    dataloader = get_dataloader(
        dataset, batch_size=global_batch_size, collate_fn=collate_with_fragment_without_pocket_edges
    )

    # Sampling
    print('Sampling...')
    for batch_i, data in tqdm(enumerate(dataloader), total=len(dataloader)):
        batch_size = len(data['positions'])

        # Retry up to 5 times if sampling fails with FoundNaNException
        chain = None
        for _ in range(5):
            try:
                chain, node_mask = ddpm.sample_chain(data, sample_fn=sample_fn, keep_frames=1)
                break
            except FoundNaNException:
                continue
        if chain is None:
            raise Exception('Could not generate in 5 attempts')

        # The first ddpm.n_dims features are coordinates, the rest are atom features
        x = chain[0][:, :, :ddpm.n_dims]
        h = chain[0][:, :, ddpm.n_dims:]

        # Put the molecule back to the initial orientation
        com_mask = data['fragment_only_mask'] if ddpm.center_of_mass == 'fragments' else data['anchors']
        pos_masked = data['positions'] * com_mask
        N = com_mask.sum(1, keepdims=True)
        mean = torch.sum(pos_masked, dim=1, keepdim=True) / N
        x = x + mean * node_mask

        offset_idx = batch_i * global_batch_size
        names = [f'output_{offset_idx+i}_{name}' for i in range(batch_size)]

        # Zero the mask at pocket atoms before saving
        # (presumably so that pocket atoms are left out of the .xyz files - TODO confirm)
        node_mask[torch.where(data['pocket_mask'])] = 0
        save_xyz_file(output_dir, h, x, node_mask, names=names, is_geom=ddpm.is_geom, suffix='')

        for i in range(batch_size):
            out_xyz = f'{output_dir}/output_{offset_idx+i}_{name}_.xyz'
            out_sdf = f'{output_dir}/output_{offset_idx+i}_{name}_.sdf'
            # Pass the arguments as a list (no shell) so that paths with spaces or shell
            # metacharacters are handled literally; obabel's stderr is discarded.
            try:
                subprocess.run(['obabel', out_xyz, '-O', out_sdf], stderr=subprocess.DEVNULL)
            except OSError as e:
                # Conversion stays best-effort: e.g. a missing obabel must not abort sampling
                print(f'Could not run obabel to convert {out_xyz}: {e}')

    print(f'Saved generated molecules in .xyz and .sdf format in directory {output_dir}')

if __name__ == '__main__':
    # Script entry point: parse the command line and forward every option to
    # main() under the parameter name it expects.
    cli = parser.parse_args()
    options = {
        'input_path': cli.fragments,
        'pocket_path': cli.pocket,
        'backbone_atoms_only': cli.backbone_atoms_only,
        'model': cli.model,
        'output_dir': cli.output,
        'n_samples': cli.n_samples,
        'n_steps': cli.n_steps,
        'linker_size': cli.linker_size,
        'anchors': cli.anchors,
        'max_batch_size': cli.max_batch_size,
        'random_seed': cli.random_seed,
    }
    main(**options)

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

generate_with_pocket.py

generate_with_pocket.py

Files

generate_with_pocket.py

Latest commit

History

generate_with_pocket.py

File metadata and controls