Source code for gaddlemaps._alignment

# -*- coding: utf-8 -*-
#    Gaddlemaps python module.
#    Copyright (C) 2019-2021 José Manuel Otero Mato, Hadrián Montes Campos, Luis Miguel Varela Cabo
#
#    This program is free software: you can redistribute it and/or modify
#    it under the terms of the GNU Affero General Public License as published by
#    the Free Software Foundation, either version 3 of the License, or
#    (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU Affero General Public License for more details.
#
#    You should have received a copy of the GNU Affero General Public License
#    along with this program.  If not, see <https://www.gnu.org/licenses/>.
'''
This module contains features to manage the alignment engine.
'''

from typing import Any, List, Optional, Tuple, TYPE_CHECKING

import numpy as np

from . import ExchangeMap, minimize_molecules, Restriction
from .components import Molecule, are_connected, Residue
from .parsers import open_coordinate_file

if TYPE_CHECKING:
    from ipywidgets import Widget


[docs]class Alignment:
    """
    Class to manage the molecules alignment.

    This class serve as interface during the molecular alignment process. It
    will take two molecules in different representations (e.g. coarse grained
    and atomistic), one of them will be taken as initial representation and the
    other as the final one. These molecules can be passed as arguments in the
    object initialization or assigned after its creation.

    NOTE: Once both molecules are assigned you can only set them again with the
    same molecule type (although it may have different position).

    Once the molecules are fixed you can call the "aling_molecules" method to
    find the optimum overlap between both representation. Then, you should
    call the "init_exchange_map" method to generate the map to change between
    resolution for other start molecules configurations. You can also call
    the "write_comparative_gro" method to write a .gro file with the two
    aligned molecules with different residue names to visualize the found
    overlap in other visualization programs.

    Parameters
    ----------
    start : Molecule
        The molecule to align in the initial resolution.
    end : Molecule
        The molecule to align in the final resolution.

    Attributes
    ----------
    exchange_map : ExchangeMap
        The map to change from initial resolution to the end one. It has to be
        initialized with the init_exchange_map method.
    SIGMA_SCALE : float
        A number that modulates the molecule displacements.
    STEPS_FACTOR : int
        The factor to apply in the calculation of the number of steps in the
        Monte-Carlo alignment.

    """

    SIGMA_SCALE = 0.5
    STEPS_FACTOR = 5000

    def __init__(self, start: Molecule = None,
                 end: Molecule = None):
        super(Alignment, self).__init__()
        self._start: Optional[Molecule] = None
        self._end: Optional[Molecule] = None
        self.start = start
        self.end = end
        self.exchange_map: Optional[ExchangeMap] = None

    @property
    def start(self) -> Optional[Molecule]:
        """
        Molecule : The molecule in the initial resolution.

        Setter Raises
        -------------
        TypeError
            If the molecule is not instance of Molecule
        ValueError
            If the molecule does not match with the end.
        """
        return self._start

    @start.setter
    def start(self, molecule: Optional[Molecule]):
        if molecule is None:
            self._start = None
            return
        if not isinstance(molecule, Molecule):
            raise TypeError(('The start attribute has to be an instance of'
                             ' Molecule, not {}.'.format(type(molecule))))
        if (self._end is None) or (self._start is None):
            self._start = molecule.copy()
        else:
            if molecule == self._start:
                self._start = molecule.copy()
            else:
                raise ValueError(('The start molecule do not match with the'
                                  ' end one. Pleas check that both molecules'
                                  ' are the same.'))

    @property
    def end(self) -> Optional[Molecule]:
        """
        Molecule : The molecule in the final resolution.

        Setter Raises
        -------------
        TypeError
            If the molecule is not instance of Molecule
        ValueError
            If the molecule does not match with the start.
        """
        return self._end

    @end.setter
    def end(self, molecule: Optional[Molecule]):
        if molecule is None:
            self._end = None
            return
        if not isinstance(molecule, Molecule):
            raise TypeError(('The end attribute has to be an instance of'
                             ' Molecule, not {}.'.format(type(molecule))))
        if (self._start is None) or (self._end is None):
            self._end = molecule.copy()
        else:
            if molecule == self._end:
                self._end = molecule.copy()
            else:
                raise ValueError(('The end molecule do not match with the'
                                  ' end one. Pleas check that both molecules'
                                  ' are the same.'))

[docs]    def align_molecules(self, restrictions: List[Tuple[int, int]] = None,
                        deformation_types: Tuple[int, ...] = None,
                        ignore_hydrogens: bool = True,
                        auto_guess_protein_restrictions: bool = True):
        """
        Starts the alignment engine to find the optimal overlap between molecs.

        NOTE: If None is input as restrictions and the molecules to align
        have multiple residues and also the auto_guess_protein_restrictions
        parameter is True, they will be guessed by residue matching (see
        guess_protein_restrains function). However take into account that the
        time taken by this step scales with the square of the number of atoms if
        no restrictions are given and linearly if they are guessed for all the
        atoms by residue matching.

        If you know that the molecules in both resolution are in the same
        configuration you may want to perform the alignment just rotating and
        translating the molecules as a whole and avoid molecular deformation by
        setting the "deformation_types" parameter to (0, 1). This will translate
        in a better performance.

        On the other hand, in the most of the cases, the hydrogen atoms are not
        very relevant atoms for the alignment so they are by default (see
        "ignore_hydrogens" parameter) not taken into account in the distance
        minimization process when they belong to the molecule with larger number
        of atoms (the one that will remain still in the process).

        Parameters
        ----------
        restrictions : list of tuple of int, optional
            A list of tuples with pairs of atom numbers corresponding to
            start and end atoms indexes in the molecules. The align will be
            performed privileging configurations where those atoms are close.
            By default is set to [].

            Example:
            >>> restrictions = [(1, 3), (4, 5)]

            **IMPORTANT INDEX ARE REFERENCED TO THE ATOM INDEX IN THE MOLECULE
            (STARTS IN 0).**

        deformation_types : tuple of int, optional
            Specifies the type of the minimization. Possible options:
                0 : Translation
                1 : Rotation
                2 : Individual atom move
            If it is None, all the possibilities are chosen.
        ignore_hydrogens : bool, optional
            If True, hydrogen atoms will not be included in the minimization
            of the distances. This will be only applied to the molecule which
            is not moved in the alignment engine.
        auto_guess_protein_restrictions : bool, optional
            If True automatic restrictions will try to be guessed if the
            molecules to be aligned have multiple residues.

        Raises
        ------
        IOError
            If the automatic restrictions guess failed. In that case, consider
            to set "auto_guess_protein_restrictions" parameter to False.

        """

        if self.start is None or self.end is None:
            raise ValueError("Start and End molecules must be set before the aligment")

        if restrictions is None:
            # Check for multiple residues
            restrictions = []
            if len(self.start.resnames) > 1 and auto_guess_protein_restrictions:
                try:
                    restrictions = guess_protein_restrains(self.start,
                                                           self.end)
                except IOError:
                    raise IOError(('Automatic restrictions can not be guessed.'
                                   ' Try to set '
                                   'auto_guess_protein_restrictions to '
                                   'False.'))

        if deformation_types is None:
            if len(self.start) == 1 or len(self.end) == 1:
                deformation_types = (0, )
            else:
                deformation_types = (0, 1, 2)

        # Move molecules to share geometric center
        self.start.move_to(self.end.geometric_center)
        # Identify the smallest molecule
        if len(self.start) < len(self.end):
            molecules = [self.end, self.start]
            restrictions = [i[::-1] for i in restrictions]
        else:
            molecules = [self.start, self.end]
        if len(self.end) == 1:
            return
        # Check connectivity
        if not are_connected(molecules[1].atoms):
            raise IOError((f'Problem with {molecules[1].name} molecule. In'
                           ' the alignment process the molecules in low '
                           'resolution must be formed by connected atoms.'
                           ' Please, check you topologies.'))
        # Init the inputs to the backend
        if ignore_hydrogens:
            mol1_positions, restrictions = remove_hydrogens(molecules[0],
                                                            restrictions)
        else:
            mol1_positions = molecules[0].atoms_positions
        mol2_positions = molecules[1].atoms_positions
        mol2_bonds_info = molecules[1].bonds_distance
        mol2_com = molecules[1].geometric_center
        n_steps = self.STEPS_FACTOR*len(molecules[1])
        mol1_bonds_info = molecules[0].bonds_distance.values()
        translation_width = 2*min(t[1] for l in mol1_bonds_info for t in l)
        mol2_positions = minimize_molecules(mol1_positions, mol2_positions,
                                            mol2_com, self.SIGMA_SCALE,
                                            n_steps, restrictions,
                                            mol2_bonds_info, translation_width,
                                            deformation_types)
        # Change mol2 positions. Ensure that the changes are applied.
        if len(self.start) < len(self.end):
            self.start.atoms_positions = mol2_positions
        else:
            self.end.atoms_positions = mol2_positions

[docs]    def init_exchange_map(self, scale_factor: float = 0.5):
        """
        Initializes the exchange map with the current molecules configuration.

        Parameters
        ----------
        scale_factor : float
            The compression factor to apply to mapped molecules (see the
            ExchangeMap class documentation for more information).

        """
        if self.start is None or self.end is None:
            raise ValueError("Start and End molecules must be set before the exchange map")
        self.exchange_map = ExchangeMap(self.start, self.end, scale_factor)

[docs]    def write_comparative_gro(self, fname: str = None):
        """
        Writes a .gro file with start and end molecules to check the overlap.

        The molecules have START and END residue names respectively to ease the
        representation.

        Parameters
        ----------
        fname : string, optional
            The output .gro file name. If it is None, fname is set as:
            {name}_compare.gro, where name is the name of the molecule.

        """

        if self.start is None or self.end is None:
            raise ValueError("Start and End molecules must be set before writng the comparative")

        if fname is None:
            fname = '{}_compare.gro'.format(self.start.name)
        start = self.start.deep_copy()
        end = self.end.deep_copy()
        start.resnames = 'START'  # type: ignore
        end.resnames = 'END'  # type: ignore
        start.resids = 1  # type: ignore
        end.resids = 2  # type: ignore
        start.atoms_velocities = None
        end.atoms_velocities = None
        with open_coordinate_file(fname, 'w') as fgro:
            for atom in start:
                fgro.writeline(atom.gro_line())
            for atom in end:
                fgro.writeline(atom.gro_line())

[docs]    def interactive_restrictions(self) -> Tuple['Widget',Restriction]:
        """
        Creates the widget to visually generate the restrictions for alignment.

        Returns
        -------
        restriction_widget: ipywidgets.Widget
            The widget that contains the constraint generator.
        restrictions: List[Tuple[int, int]]
            The restrictions that will be generated by the widget. This will
            be initially empty and it will be filled as the widget is used.
        """
        if self.start is None or self.end is None:
            raise IOError(('Molecules in initial and final resolution must be'
                           ' assigned.'))

        from ._represent import create_widget_restrictions

        start_molecule = self.start
        end_molecule = self.end
        start_molecule.move_to([0, 0, 0])
        end_molecule.move_to([0, 0, 0])

        box, restrictions = create_widget_restrictions(start_molecule, end_molecule)

        return box, restrictions


[docs]def remove_hydrogens(molecule: Molecule,
                     restrictions: List[Tuple[int, int]]) -> Tuple[np.ndarray,
                                                                   List[Tuple[int, int]]]:
    """
    Returns positions of atoms that are not hydrogens and fixes restrictions.

    Parameters
    ----------
    molecule : Molecule
        The molecule to remove hydrogens.
    restrictions : list of tuple of int
        A list of tuples with pairs of atom numbers corresponding to start
        and end atoms molecules. The align will be performed privileging
        configurations where those atoms are close. By default is set to [].

        Example
        -------
        >>> restrictions = [(1, 3), (4, 5)]

        **IMPORTANT: INDEX ARE REFERENCED TO THE ATOM INDEX IN THE MOLECULE**

    Returns
    -------
    new_positions : numpy.ndarray
        The positions of the atoms that are not hydrogens.
    new_restrictions : list of tuple of int
        The restrictions with the correct indexes.

    """
    positions = []
    index_1map = {}
    for index, atom in enumerate(molecule):
        if atom.element != 'H':
            positions.append(atom.position)
            index_1map[index] = len(positions) - 1
    new_restrictions = []
    for index_1, index_2 in restrictions:
        if index_1 in index_1map:
            new_restrictions.append((index_1map[index_1], index_2))
    return np.array(positions), new_restrictions


[docs]def guess_protein_restrains(mol1: Molecule,
                            mol2: Molecule) -> List[Tuple[int, int]]:
    """
    Guess restrains for molecules with multiple residues.

    Checks if mol1 and mol2 have the same residue names and create restrains
    pairing atoms with the same residues. This function only works with
    molecules with multiple residues.

    Parameters
    ----------
    mol1 : Molecule or MoleculeGro
        The first molecule to find the restrains.
    mol2 : Molecule or MoleculeGro
        The second molecule to find the restrains.

    Returns
    -------
    restrains : list of tuple of int
        A list of tuples with pairs of atom numbers corresponding to start
        and end atoms molecules. The align will be performed privileging
        configurations where those atoms are close. By default is set to [].

        Example
        -------
        >>> restrictions = [(1, 3), (4, 5)]

        **IMPORTANT: INDEX ARE REFERENCED TO THE ATOM INDEX IN THE MOLECULE
        (STARTS IN 0).**

    Raises
    ------
    IOError
        If the input molecules have different residue names.
    """
    if len(mol1.resnames) != len(mol2.resnames):
        raise IOError(('Input molecules must have the same number of residues:'
                       ' {}, {}.'.format(len(mol1.resnames),
                                         len(mol2.resnames))))
    if mol1.resnames != mol2.resnames:
        # Special case where resnames are similar
        for res1, res2 in zip(mol1.resnames, mol2.resnames):
            if (res1 in res2) or (res2 in res1):
                continue
            raise IOError(('Input molecules must have the same residues in the'
                           ' same order.'))
    restrains: List[Tuple[int, int]] = []
    offset1 = 0
    offset2 = 0
    for mol_res1, mol_res2 in zip(mol1.residues, mol2.residues):
        restrains += guess_residue_restrains(mol_res1, mol_res2, offset1,
                                              offset2)
        offset1 += len(mol_res1)
        offset2 += len(mol_res2)
    return restrains


[docs]def guess_residue_restrains(res1: Residue, res2: Residue, offset1: int = 0,
                            offset2: int = 0) -> List[Tuple[int, int]]:
    """
    Guess restrains for molecules with one residue.

    Parameters
    ----------
    res1 : Residue
        The first residue to find the restrains.
    res2 : Residue
        The second residue to find the restrains.
    offset1 : int
        An offset to add to the atom index of res1.
    offset2 : int
        An offset to add to the atom index of res2.

    Returns
    -------
    restrains : list of tuple of int
        A list of tuples with pairs of atom numbers corresponding to start
        and end atoms residues. The align will be performed privileging
        configurations where those atoms are close. By default is set to [].

        Example
        -------
        >>> restrictions = [(1, 3), (4, 5)]

        **IMPORTANT: INDEX ARE REFERENCED TO THE ATOM INDEX IN THE MOLECULE
        (STARTS IN 0).**
    """
    n_parts = min(len(res1), len(res2))
    mol1_ids_groups = _split_list(list(range(len(res1))), n_parts)
    mol2_ids_groups = _split_list(list(range(len(res2))), n_parts)
    restr: List[Tuple[int, int]] = []
    for group1, group2 in zip(mol1_ids_groups, mol2_ids_groups):
        restr += [(i+offset1, j+offset2) for i in group1 for j in group2]
    return restr


def _split_list(alist: List[Any], wanted_parts: int) -> List[List[Any]]:
    length = len(alist)
    return [alist[i*length // wanted_parts: (i+1)*length // wanted_parts]
            for i in range(wanted_parts)]