datamol.fragment
¶
Molecule fragmentation¶
anybreak(mol, remove_parent=False, sanitize=True, fix=True)
¶
Fragment molecule by applying brics first, then fall back to frag.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
mol |
Mol |
a molecule. |
required |
remove_parent |
bool |
Remove parent from the fragments. |
False |
sanitize |
bool |
Whether to sanitize the fragments. |
True |
fix |
bool |
Whether to fix the fragments. |
True |
Source code in datamol/fragment/_fragment.py
def anybreak(
    mol: Chem.rdchem.Mol,
    remove_parent: bool = False,
    sanitize: bool = True,
    fix: bool = True,
):
    """Fragment molecule by applying brics first, then fall back to frag.

    `brics` is tried first. If it raises or returns no fragment, the result
    of `frag` is returned instead.

    Args:
        mol: a molecule.
        remove_parent: Remove parent from the fragments.
        sanitize: Whether to sanitize the fragments.
        fix: Whether to fix the fragments.

    Returns:
        The fragments produced by `brics`, or by `frag` when `brics` failed
        or produced nothing.
    """
    frags = []
    try:
        frags = brics(mol, fix=fix, remove_parent=remove_parent, sanitize=sanitize)
    except Exception:
        # Best-effort: any BRICS failure falls through to `frag` below.
        # Catching `Exception` rather than using a bare `except` lets
        # KeyboardInterrupt / SystemExit propagate.
        pass

    if not frags:
        frags = frag(mol, remove_parent=remove_parent, sanitize=sanitize, fix=fix)
    return frags
brics(mol, singlepass=True, remove_parent=False, sanitize=True, fix=True)
¶
Run BRICS on the molecules and potentially fix dummy atoms.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
mol |
Mol |
a molecule. |
required |
singlepass |
bool |
Single pass for `BRICSDecompose`. |
True |
remove_parent |
bool |
Remove parent from the fragments. |
False |
sanitize |
bool |
Whether to sanitize the fragments. |
True |
fix |
bool |
Whether to fix the fragments. |
True |
Source code in datamol/fragment/_fragment.py
def brics(
    mol: Chem.rdchem.Mol,
    singlepass: bool = True,
    remove_parent: bool = False,
    sanitize: bool = True,
    fix: bool = True,
):
    """Decompose a molecule with BRICS and optionally clean up the fragments.

    Args:
        mol: a molecule.
        singlepass: Forwarded to `BRICSDecompose` as `singlePass`.
        remove_parent: Drop the first element of the decomposition
            (presumably the parent molecule -- TODO confirm the ordering
            returned by `BRICSDecompose`).
        sanitize: Whether to sanitize the fragments.
        fix: Whether to fix the fragments.

    Returns:
        The list of fragments, with `None` entries removed.
    """
    pieces = list(BRICS.BRICSDecompose(mol, returnMols=True, singlePass=singlepass))

    if fix:
        pieces = list(map(dm.fix_mol, pieces))
    if sanitize:
        pieces = list(map(dm.sanitize_mol, pieces))

    # The first element is removed before the `None` filtering below, so it
    # is dropped even when fixing/sanitizing turned it into `None`.
    if remove_parent:
        pieces.pop(0)

    return [piece for piece in pieces if piece is not None]
frag(mol, remove_parent=False, sanitize=True, fix=True)
¶
Generate all possible fragmentation of a molecule.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
mol |
Mol |
a molecule. |
required |
remove_parent |
bool |
Remove parent from the fragments. |
False |
sanitize |
bool |
Whether to sanitize the fragments. |
True |
fix |
bool |
Whether to fix the fragments. |
True |
Source code in datamol/fragment/_fragment.py
def frag(
    mol: Chem.rdchem.Mol,
    remove_parent: bool = False,
    sanitize: bool = True,
    fix: bool = True,
):
    """Generate all possible fragmentations of a molecule.

    The fragmentations come from `FraggleSim.generate_fraggle_fragmentation`.
    Each one is split on "." and the unique pieces are converted to
    molecules in reverse-sorted SMILES order.

    Args:
        mol: a molecule.
        remove_parent: Remove parent from the fragments.
        sanitize: Whether to sanitize the fragments.
        fix: Whether to fix the fragments.

    Returns:
        The list of fragments (without `None` entries), preceded by `mol`
        itself unless `remove_parent` is set.
    """
    fragmentations = FraggleSim.generate_fraggle_fragmentation(mol)

    # Collect every distinct "."-separated piece across all fragmentations.
    unique_smiles = set()
    for fragmentation in fragmentations:
        unique_smiles.update(piece.strip() for piece in fragmentation.split("."))

    ordered_smiles = sorted(unique_smiles, reverse=True)
    pieces = [dm.to_mol(smi) for smi in ordered_smiles]

    if fix:
        pieces = list(map(dm.fix_mol, pieces))
    if sanitize:
        pieces = list(map(dm.sanitize_mol, pieces))
    pieces = [piece for piece in pieces if piece is not None]

    return pieces if remove_parent else [mol] + pieces
recap(mol, remove_parent=False, sanitize=True, fix=True)
¶
Fragment the molecule using the recap algorithm.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
mol |
Mol |
a molecule. |
required |
remove_parent |
bool |
Remove parent from the fragments. |
False |
sanitize |
bool |
Whether to sanitize the fragments. |
True |
fix |
bool |
Whether to fix the fragments. |
True |
Source code in datamol/fragment/_fragment.py
def recap(
    mol: Chem.rdchem.Mol,
    remove_parent: bool = False,
    sanitize: bool = True,
    fix: bool = True,
):
    """Fragment the molecule using the recap algorithm.

    Every key of `RecapDecompose(mol).GetAllChildren()` is converted to a
    molecule.

    Args:
        mol: a molecule.
        remove_parent: Remove parent from the fragments.
        sanitize: Whether to sanitize the fragments.
        fix: Whether to fix the fragments.

    Returns:
        The list of fragments (without `None` entries), preceded by `mol`
        itself unless `remove_parent` is set.
    """
    hierarchy = Recap.RecapDecompose(mol)
    children = hierarchy.GetAllChildren()
    pieces = [dm.to_mol(key) for key in children.keys()]

    if fix:
        pieces = list(map(dm.fix_mol, pieces))
    if sanitize:
        pieces = list(map(dm.sanitize_mol, pieces))
    pieces = [piece for piece in pieces if piece is not None]

    return pieces if remove_parent else [mol] + pieces
MMPA fragmentation¶
mmpa_cut(mol, rdkit_pattern=False)
¶
Cut molecules to perform mmpa analysis later
Parameters:
Name | Type | Description | Default |
---|---|---|---|
mol |
Mol |
Molecule to fragment. |
required |
rdkit_pattern |
bool |
Whether to perform the fragmentation using the default rdkit pattern: `[#6+0;!$(*=,#[!#6])]!@!=!#[*]` |
False |
Returns:
Type | Description |
---|---|
Optional[Set[Any]] |
List of 'smiles,core,chains' |
Source code in datamol/fragment/_fragment.py
def mmpa_cut(mol: Chem.rdchem.Mol, rdkit_pattern: bool = False) -> Optional[Set[Any]]:
    """Cut a molecule to prepare a later MMPA analysis.

    Two rounds of cuts are performed with `mmpa_frag`: one on the molecule
    as given, then one on the molecule with explicit hydrogens (only when it
    has fewer than 60 heavy atoms).

    Args:
        mol: Molecule to fragment.
        rdkit_pattern: Whether to perform the fragmentation
            using the default rdkit pattern: '[#6+0;!$(*=,#[!#6])]!@!=!#[*]'

    Returns:
        A set of 'smiles,core,chains' lines (each ending with a newline), or
        `mol` itself when `mol` is None.
    """
    if mol is None:
        return mol

    def _as_lines(parent_smiles, cuts):
        # One "smiles,core,chains\n" record per (core, chains) pair.
        return {f"{parent_smiles},{core},{chains}\n" for core, chains in cuts}

    parent_smiles = dm.to_smiles(mol)

    if rdkit_pattern:
        cuts = mmpa_frag(mol, max_cut=3, max_bond_cut=30)
    else:
        # heavy atoms
        cuts = mmpa_frag(mol, pattern="[!#1]!@!=!#[!#1]", max_cut=4, max_bond_cut=30)
        cuts.update(mmpa_frag(mol, pattern="[!#1]!@!=!#[!#1]", max_cut=3, max_bond_cut=30))

    outlines = _as_lines(parent_smiles, set(cuts))

    # hydrogen splitting
    mol_with_hs = Chem.AddHs(mol)
    smiles_with_hs = dm.to_smiles(mol_with_hs)

    if mol_with_hs.GetNumHeavyAtoms() < 60:
        h_cuts = mmpa_frag(mol_with_hs, pattern=None, max_cut=1, max_bond_cut=100, h_split=True)
        outlines |= _as_lines(smiles_with_hs, h_cuts)

    return outlines
mmpa_frag(mol, pattern=None, max_cut=1, max_bond_cut=20, h_split=False)
¶
Fragment molecule on specific bonds suitable for a MMPA analysis.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
mol |
|
Molecule to fragment. |
required |
pattern |
str |
Bond pattern to split on. Will use default rdkit pattern `[#6+0;!$(*=,#[!#6])]!@!=!#[*]` if not provided. |
None |
max_cut |
int |
Number of cuts. |
1 |
max_bond_cut |
int |
Maximum number of bonds to cut. Defaults to 20. |
20 |
h_split |
bool |
Whether to split at hydrogen position too. This is equivalent to enabling the addition of new fragments. |
False |
Returns:
Type | Description |
---|---|
Optional[Set[rdkit.Chem.rdchem.Mol]] |
List of fragments |
Source code in datamol/fragment/_fragment.py
def mmpa_frag(
    mol,
    pattern: Optional[str] = None,
    max_cut: int = 1,
    max_bond_cut: int = 20,
    h_split: bool = False,
) -> Optional[Set[Chem.Mol]]:
    """Fragment molecule on specific bonds suitable for a MMPA analysis.

    Args:
        mol: Molecule to fragment.
        pattern: Bond pattern to split on. Will use default rdkit pattern
            '[#6+0;!$(*=,#[!#6])]!@!=!#[*]' if not provided. An empty string
            disables this fragmentation step entirely.
        max_cut: Number of cuts.
        max_bond_cut: Maximum number of bonds to cut. Defaults to 20.
        h_split: Whether to split at hydrogen position too.
            This is equivalent to enabling the addition of new fragments.

    Returns:
        Set of the unique fragmentation results returned by
        `rdMMPA.FragmentMol` (called with `resultsAsMols=False`).
    """
    # Always accumulate into a list so the result type of `FragmentMol`
    # does not matter when the hydrogen cuts are appended below.
    frags = []

    if pattern is None or pattern:
        # Only forward `pattern` when one was given, so that RDKit applies
        # its own default otherwise.
        pattern_kwargs = {} if pattern is None else {"pattern": pattern}
        frags = list(
            rdMMPA.FragmentMol(
                mol,
                maxCuts=max_cut,
                resultsAsMols=False,
                maxCutBonds=max_bond_cut,
                **pattern_kwargs,
            )
        )

    if h_split:
        # Single cuts between a hydrogen and a non-hydrogen atom; hydrogens
        # must be explicit for the pattern to match.
        mol = Chem.AddHs(mol)
        frags.extend(
            rdMMPA.FragmentMol(
                mol,
                pattern="[#1]!@!=!#[!#1]",
                maxCuts=1,
                resultsAsMols=False,
                maxCutBonds=max_bond_cut,
            )
        )

    return set(frags)
Molecule assembly¶
assemble_fragment_iter(fragmentlist, seens=None, scrambleReagents=False, max_n_mols=inf, maxdepth=3, as_smiles=True, RXNS=None)
¶
Perform an assembly from fragment given all potential RXNS transformation.
Source code in datamol/fragment/_assemble.py
def assemble_fragment_iter(
    fragmentlist,
    seens=None,
    scrambleReagents=False,
    max_n_mols=float("inf"),
    maxdepth=3,
    as_smiles=True,
    RXNS=None,
):
    """Perform an assembly from fragment given all potential RXNS transformation.

    Args:
        fragmentlist: fragments that can be attached to the seeds.
        seens: seed molecules to grow. When empty or None, the fragments
            themselves are used as seeds.
        scrambleReagents: Whether to shuffle the seeds before growing them.
        max_n_mols: budget of unique products; nothing is generated when it
            is <= 0, and generation stops once a recursive step brings the
            number of unique products up to this value.
        maxdepth: maximum recursion depth for growing intermediate products.
        as_smiles: Whether to yield SMILES strings instead of molecules.
        RXNS: reactions to apply. Defaults to `ALL_BRICS_RETRO`.

    Yields:
        Each unique product (by SMILES), as a SMILES string when `as_smiles`
        is set and as a molecule otherwise.
    """
    if RXNS is None:
        RXNS = ALL_BRICS_RETRO

    if max_n_mols <= 0:
        return

    # SMILES of every product yielded so far, shared across all seeds.
    seen = set()

    if not seens:
        seens = list(fragmentlist)
    if scrambleReagents:
        seens = list(seens)
        # The `random=` argument of `random.shuffle` was removed in
        # Python 3.11; the default source is `random.random` anyway.
        random.shuffle(seens)

    # NOTE: the loop variable must not be called `seen`, otherwise it would
    # shadow the set of already-produced SMILES used for de-duplication.
    for seed in seens:
        nextSteps = []
        for rxn in RXNS:
            for fg in fragmentlist:
                for m, pSmi in _run_at_all_rct(rxn, fg, seed):
                    if pSmi not in seen:
                        seen.add(pSmi)
                        yield pSmi if as_smiles else m
                    if _can_continue_with(m, rxn):
                        nextSteps.append(m)

        if nextSteps and len(seen) <= max_n_mols and maxdepth > 0:
            # Recurse with `as_smiles=False` so molecules come back and can
            # be canonicalised here, and forward `RXNS` so a custom reaction
            # set is used at every depth.
            for p in assemble_fragment_iter(
                fragmentlist,
                seens=nextSteps,
                scrambleReagents=scrambleReagents,
                max_n_mols=(max_n_mols - len(seen)),
                maxdepth=maxdepth - 1,
                as_smiles=False,
                RXNS=RXNS,
            ):
                pSmi = Chem.MolToSmiles(p, True)
                if pSmi not in seen:
                    seen.add(pSmi)
                    yield pSmi if as_smiles else p
                    if len(seen) >= max_n_mols:
                        return
assemble_fragment_order(fragmentlist, seen=None, allow_incomplete=False, max_n_mols=inf, RXNS=None)
¶
Assemble a list of fragments into a set of possible molecules under the rules defined by the BRICS algorithm.
Note: we assume that (1) the order of the fragments in fragmentlist matters, (2) none of the fragments has explicitly defined hydrogen atoms, and (3) only a list of unique molecules is maintained internally.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
fragmentlist |
|
list of original fragments to grow |
required |
seen |
|
original molecules used as base. If None, the first element of the fragment list will be popped out |
None |
allow_incomplete |
bool |
Whether to accept assembled molecules with missing fragments |
False |
Source code in datamol/fragment/_assemble.py
def assemble_fragment_order(
    fragmentlist,
    seen=None,
    allow_incomplete: bool = False,
    max_n_mols: float = float("inf"),
    RXNS=None,
):
    """Assemble fragments, in order, into the possible molecules allowed by the reactions.

    ..note ::
        We are of course assuming:
        1. that the order in the fragmentlist matters.
        2. that none of the fragments has explicitly defined hydrogen atoms.
        3. only a list of unique molecules is internally maintained.

    Args:
        fragmentlist: list of original fragments to grow.
        seen: original molecule used as base. If None, the first element of
            the fragment list is popped and used instead.
        allow_incomplete: Whether to also yield intermediate products, i.e.
            molecules assembled before every fragment has been consumed.
        max_n_mols: stop consuming fragments, and stop yielding final
            molecules, once this many molecules have been yielded.
        RXNS: reactions to apply. Defaults to `ALL_BRICS_RETRO`.

    Yields:
        Intermediate products (only with `allow_incomplete`), then the
        molecules built from the SMILES kept after the last processed
        fragment.
    """
    if RXNS is None:
        RXNS = ALL_BRICS_RETRO

    remaining = list(fragmentlist)
    n_yielded = 0

    base = remaining.pop(0) if seen is None else seen
    seen = [Chem.MolToSmiles(base)]  # only one molecule to assemble

    while n_yielded < max_n_mols and remaining:
        # Try every way of attaching the next fragment to the current level.
        next_fragment = remaining.pop(0)
        current_level = [dm.to_mol(smi) for smi in seen]
        seen = set()
        for partial in current_level:
            try:
                # there is no point in even trying something on molecules that cannot be kekulized
                for rxn in RXNS:
                    for product, product_smiles in _run_at_all_rct(rxn, next_fragment, partial):
                        if allow_incomplete and product_smiles not in seen:
                            yield product
                            n_yielded += 1
                        seen.add(product_smiles)
            except Exception as err:
                print(err)

    for smi in seen:
        if n_yielded < max_n_mols:
            yield dm.to_mol(smi)
            n_yielded += 1
break_mol(mol, minFragmentSize=1, silent=True, onlyUseReactions=[], randomize=False, mode='brics', returnTree=False)
¶
Break a molecule into a list of fragments.
Source code in datamol/fragment/_assemble.py
def break_mol(
mol: Chem.rdchem.Mol,
minFragmentSize: int = 1,
silent: bool = True,
onlyUseReactions: list = [],
randomize: bool = False,
mode: str = "brics",
returnTree: bool = False,
):
"""Break a molecule into a list of fragments.

Reactions are applied repeatedly, starting from `mol`. Every product that is
kept becomes a node of a directed graph, linked from the molecule it was
produced from.

Args:
mol: molecule to fragment.
minFragmentSize: size threshold. A product set is preferred when all its
products have more than this many explicit atoms, and a product set is
rejected when one of its products has fewer non-dummy atoms than this.
silent: when False, print the number of product sets and the selected index
for each reaction that produced something.
onlyUseReactions: when non-empty, reactions whose type label is not in this
collection are skipped.
randomize: Whether to apply the reactions in a random order.
mode: "brics" or "rxn" (case-insensitive) selects the corresponding reaction
set; any other value uses both sets.
returnTree: Whether to also return the fragmentation graph.

Returns:
`(leaves_smiles, allNodes)`, or `(leaves_smiles, allNodes, G)` when
`returnTree` is set. `leaves_smiles` lists the SMILES of the graph nodes that
have a parent and no child; `allNodes` is the set of SMILES recorded during
the fragmentation (it always contains the SMILES of the input molecule).
"""
# Pick the reactions and their type labels (two parallel sequences).
if mode.lower() == "brics":
all_reactions = ALL_BRICS
all_reactions_type = ALL_BRICS_TYPE
elif mode.lower() == "rxn":
all_reactions = ALL_RXNS
all_reactions_type = ALL_RXNS_TYPE
else:
all_reactions = ALL_BRICS + ALL_RXNS
all_reactions_type = ALL_BRICS_TYPE + ALL_RXNS_TYPE
# The same permutation is applied to both sequences so that each reaction
# keeps its type label.
if randomize:
p = np.random.permutation(len(all_reactions))
all_reactions = [all_reactions[ind] for ind in p]
all_reactions_type = [all_reactions_type[ind] for ind in p]
nx = dm.graph._get_networkx()
mSmi = Chem.MolToSmiles(mol, isomericSmiles=True)
# Fragmentation graph; node 0 holds the input molecule.
G = nx.DiGraph()
node_num = 0
G.add_node(node_num, smiles=mSmi, mol=mol)
allNodes = set()
# Pending work: SMILES -> graph node id of the molecules still to be broken.
activePool = {mSmi: node_num}
allNodes.add(mSmi)
while activePool:
# Take the first pending entry out of the pool.
nSmi = list(activePool.keys())[0]
parent = activePool.pop(nSmi)
node = G.nodes[parent]
mol = node["mol"]
for rxnIdx, reaction in zip(all_reactions_type, all_reactions):
if onlyUseReactions and rxnIdx not in onlyUseReactions:
continue
ps = reaction.RunReactants((mol,))
if ps:
# One flag per product set: True when every product of the set has more
# than `minFragmentSize` explicit atoms.
all_pass = [
all([prod.GetNumAtoms(onlyExplicit=True) > minFragmentSize for prod in p_])
for p_ in ps
]
# Index of the first passing product set. When none passes, `nz_i` ends up
# equal to len(all_pass) and the modulo below wraps it back to index 0.
nz_i = 0
while nz_i < len(all_pass) and not all_pass[nz_i]:
nz_i += 1
if not silent:
print(nSmi, "->", len(ps), "products and selected ", nz_i)
# display(MolsToGridImage(list(itertools.chain(*list(ps))), molsPerRow=2))
prodSeq = ps[nz_i % len(all_pass)]
seqOk = True
# we want to disqualify small fragments, so sort the product sequence by size
prodSeq = [(prod.GetNumAtoms(onlyExplicit=True), prod) for prod in prodSeq]
prodSeq.sort(key=lambda x: x[0])
for _, prod in prodSeq:
# NOTE(review): `sanitized` and `pSmi` are stored as plain attributes on the
# product object and read back in the `if seqOk:` section below.
prod.sanitized = True
try:
Chem.SanitizeMol(prod)
except:
# NOTE(review): bare `except`. When the fallback `dm.sanitize_mol` succeeds,
# the `continue` skips the `prod.pSmi` assignment below while
# `prod.sanitized` stays True -- confirm the later `prod.pSmi` read is safe.
if dm.sanitize_mol(prod) is None:
seqOk = False
break
continue
pSmi = Chem.MolToSmiles(prod, isomericSmiles=True)
# The product SMILES must be readable again as a molecule.
seqOk = seqOk and (dm.to_mol(pSmi) is not None)
# Number of atoms whose symbol is not the dummy "*".
notDummies = sum([atm.GetSymbol() != "*" for atm in prod.GetAtoms()])
# nDummies = pSmi.count('*')
# if minFragmentSize > 0 and (nats - nDummies < minFragmentSize):
if minFragmentSize > 0 and notDummies < minFragmentSize:
seqOk = False
break
prod.pSmi = pSmi
if seqOk:
for _, prod in prodSeq:
if not prod.sanitized:
continue
pSmi = prod.pSmi
node_num += 1
# The node is labelled with the SMILES of the fixed product, while the pool
# is keyed by the SMILES of the product as produced.
usmi = Chem.MolToSmiles(dm.fix_mol(prod), isomericSmiles=True)
G.add_node(node_num, smiles=usmi, mol=prod)
G.add_edge(parent, node_num)
# Only products not recorded before are queued for further fragmentation.
if usmi not in allNodes:
activePool[pSmi] = node_num
allNodes.add(usmi)
G.nodes[parent]["rxn"] = rxnIdx
break # at least one reaction matches
# Leaves are the nodes that have a parent but no child.
leaves_smiles = [
G.nodes[n]["smiles"] for n in G.nodes() if G.in_degree(n) != 0 and G.out_degree(n) == 0
]
if returnTree:
return leaves_smiles, allNodes, G
return leaves_smiles, allNodes
build(ll_mols, max_n_mols=inf, mode='brics', frag_rxn=None, ADD_RNXS=[])
¶
Build a super molecule from a list of fragments
Source code in datamol/fragment/_assemble.py
def build(ll_mols, max_n_mols=float("inf"), mode="brics", frag_rxn=None, ADD_RNXS=[]):
    """Build a super molecule from a list of fragments.

    Args:
        ll_mols: list of lists of fragments. Every combination taking one
            element from each list is tried; only the first two elements of
            a combination are passed to the reactions.
        max_n_mols: maximum number of unique products to yield.
        mode: "brics" or "rxn" selects the corresponding retro reactions,
            None selects no built-in reaction, any other value selects both.
        frag_rxn: when it matches a reaction type label (surrounding double
            quotes are ignored), only that reaction is applied.
        ADD_RNXS: additional reactions, either as a sequence (labelled
            "RXN-<index>") or as a dict mapping label to reaction. This
            argument is never mutated.

    Yields:
        The products whose identifier (second element of the pairs returned
        by `_run_at_all_rct`) has not been produced before.
    """
    seen = set()

    # Work on copies: the `+=` below would otherwise extend the module-level
    # reaction sequences in place (when they are lists) and leak the extra
    # reactions into every later call.
    CUR_RXNS = []
    CUR_RXNS_TYPE = []
    if mode == "brics":
        CUR_RXNS = list(ALL_BRICS_RETRO)
        CUR_RXNS_TYPE = list(ALL_BRICS_TYPE)
    elif mode == "rxn":
        CUR_RXNS = list(ALL_RXNS_RETRO)
        CUR_RXNS_TYPE = list(ALL_RXNS_TYPE)
    elif mode is not None:
        CUR_RXNS = list(ALL_BRICS_RETRO + ALL_RXNS_RETRO)
        CUR_RXNS_TYPE = list(ALL_BRICS_TYPE + ALL_RXNS_TYPE)

    if ADD_RNXS is not None:
        ADD_RNXS_TYPE = [f"RXN-{i}" for i in range(len(ADD_RNXS))]
        if isinstance(ADD_RNXS, dict):
            ADD_RNXS_TYPE = ADD_RNXS.keys()
            ADD_RNXS = ADD_RNXS.values()
        CUR_RXNS += list(ADD_RNXS)
        CUR_RXNS_TYPE += list(ADD_RNXS_TYPE)

    # Restrict to a single reaction when `frag_rxn` names one.
    if frag_rxn is not None:
        wanted_type = frag_rxn.strip('"')
        for i, rxn_type in enumerate(CUR_RXNS_TYPE):
            if wanted_type == rxn_type:
                CUR_RXNS = [CUR_RXNS[i]]
                break

    for fraglist in itertools.product(*ll_mols):
        fraglist = list(fraglist)
        for rxn in CUR_RXNS:  # should be size==1 if frag_rxn is provided
            ps = []
            try:
                ps = _run_at_all_rct(rxn, fraglist[0], fraglist[1])
            except Exception:
                # Best-effort: a reaction that cannot be applied to this
                # pair (or a combination with fewer than two fragments) is
                # simply skipped.
                pass
            for m, mSmi in ps:
                if len(seen) >= max_n_mols:
                    return
                if mSmi not in seen:
                    seen.add(mSmi)
                    yield m
                    # Stop as soon as the budget is reached instead of
                    # running further reactions whose output is discarded.
                    if len(seen) >= max_n_mols:
                        return