
"""
Lipid Depletion-Enrichment Index
================================

Classes
-------

.. autoclass:: LipidEnrichment
    :members:

"""

from typing import Union

import scipy
import numpy as np

from MDAnalysis.core.universe import Universe
from MDAnalysis.core.groups import AtomGroup
from MDAnalysis.analysis import distances
from .base import LeafletAnalysisBase


class LipidEnrichment(LeafletAnalysisBase):
    r"""Calculate the lipid depletion-enrichment index around a protein
    by leaflet.

    The depletion-enrichment index (DEI) of a lipid around a protein
    indicates how enriched or depleted that lipid is in the lipid
    annulus around the protein, with respect to the density of the
    lipid in the bulk membrane.

    The results of this analysis contain these values:

    * 'Near protein': the number of lipids :math:`L` within
      cutoff :math:`r` of the protein
    * 'Fraction near protein': the fraction of the lipids :math:`L`
      with respect to all lipids within cutoff :math:`r` of the
      protein: :math:`\frac{n(x_{(L, r)})}{n(x_r)}`
    * 'Enrichment': the depletion-enrichment index.

    The hard-cutoff, gaussian distribution algorithm was obtained from
    [Corradi2018]_. The soft-cutoff, binomial distribution algorithm
    was first published in [Wilson2021]_. Please cite them if you use
    this analysis in published work.

    Parameters
    ----------
    universe: Universe or AtomGroup
        The atoms to apply this analysis to.
    select: str (optional)
        A :meth:`Universe.select_atoms` selection string for atoms that
        define the lipid head groups, e.g. "name PO4" or "name P*".
    select_protein: str (optional)
        Selection string for the protein.
    cutoff: float (optional)
        Cutoff in ångström.
    buffer: float (optional)
        Buffer zone length in ångström. If > 0, a soft cutoff is used.
    beta: float (optional)
        Controls the sharpness of the soft cutoff.
    distribution: str (optional)
        Whether to use the binomial or gaussian distribution.
    **kwargs
        Passed to :class:`~lipyds.analysis.base.LeafletAnalysisBase`.

    Attributes
    ----------
    dei_by_leaflet: list of dicts
        A list of dictionaries of time series data for each leaflet.
        The first dictionary is for the first leaflet, etc. Leaflets
        are sorted by z-coordinate; the first leaflet has the lowest
        z-coordinate.
    leaflets_summary: list of dicts
        A list of summary dictionaries for each leaflet. The first
        dictionary is for the first leaflet, etc.
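
    Examples
    --------
    A minimal sketch of typical usage; the file names and the
    ``"name PO4"`` head-group selection are illustrative::

        u = Universe("system.tpr", "system.xtc")
        dei = LipidEnrichment(u, select="name PO4",
                              select_protein="protein",
                              cutoff=6, buffer=2)
        dei.run()
        df = dei.summary_as_dataframe()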
""" def __init__(self, universe: Union[AtomGroup, Universe], select_protein: str = "protein", cutoff: float = 6, distribution: str = "binomial", buffer: float = 0, beta: float = 4, **kwargs): super(LipidEnrichment, self).__init__(universe, **kwargs) self.distribution = distribution.lower() if self.distribution == "binomial": self._fit_distribution = self._fit_binomial elif self.distribution == "gaussian": self._fit_distribution = self._fit_gaussian else: raise ValueError("`distribution` should be either " "'binomial' or 'gaussian'") self.protein = self.universe.select_atoms(select_protein) self.cutoff = cutoff self.buffer = buffer self.beta = beta def _prepare(self): # in case of change + re-run self.mid_buffer = self.buffer / 2.0 self.max_cutoff = self.cutoff + self.buffer self._buffer_sigma = self.buffer / self.beta if self._buffer_sigma: self._buffer_coeff = 1 / (self._buffer_sigma * np.sqrt(2 * np.pi)) self.unique_ids = np.unique(self.ids) # results self.near_counts = np.zeros((self.n_leaflets, len(self.ids), self.n_frames)) self.residue_counts = np.zeros((self.n_leaflets, len(self.ids), self.n_frames)) self.total_counts = np.zeros((self.n_leaflets, self.n_frames)) def _update_leaflets(self): super()._update_leaflets() self._set_leaflets_with_outside() self._current_leaflet_residues = {} self._current_leaflet_ids = {} self._cache = {} for i, residues in enumerate(self.leaflet_residues): leaflet_residues = sum(residues) if len(residues) else self.residues[[]] self._current_leaflet_residues[i] = leaflet_residues self._current_leaflet_ids[i] = getattr(leaflet_residues, self.group_by_attr) def _single_frame(self): # initial scoop for nearby groups coords_ = self.selection.positions pairs = distances.capped_distance(self.protein.positions, coords_, self.cutoff, box=self.protein.dimensions, return_distances=False) if pairs.size > 0: indices = np.unique(pairs[:, 1]) else: indices = [] # now look for groups in the buffer if len(indices) and self.buffer: pairs2, dist = distances.capped_distance(self.protein.positions, coords_, self.max_cutoff, min_cutoff=self.cutoff, box=self.protein.dimensions, return_distances=True) # don't count things in inner cutoff mask = [x not in indices for x in pairs2[:, 1]] pairs2 = pairs2[mask] dist = dist[mask] if pairs2.size > 0: _ix = np.argsort(pairs2[:, 1]) indices2 = pairs2[_ix][:, 1] dist = dist[_ix] - self.cutoff init_resix2 = self.selection.resindices[indices2] # sort through for minimum distance ids2, splix = np.unique(init_resix2, return_index=True) resix2 = init_resix2[splix] split_dist = np.split(dist, splix[1:]) min_dist = np.array([x.min() for x in split_dist]) # logistic function for i, leaflet_residues in self._current_leaflet_residues.items(): ids = self._current_leaflet_ids[i] match, rix, lix = np.intersect1d(resix2, leaflet_residues.resindices, assume_unique=True, return_indices=True) subdist = min_dist[rix] subids = ids[lix] for j, x in enumerate(self.ids): mask = (subids == x) xdist = subdist[mask] exp = -0.5 * ((xdist/self._buffer_sigma) ** 2) n = self._buffer_coeff * np.exp(exp) self.near_counts[i, j, self._frame_index] += n.sum() soft = self.near_counts[:, :, self._frame_index].sum() init_resix = self.selection.resindices[indices] resix = np.unique(init_resix) for i, leaflet_residues in self._current_leaflet_residues.items(): ids = self._current_leaflet_ids[i] _, ix1, ix2 = np.intersect1d(resix, leaflet_residues.resindices, assume_unique=True, return_indices=True) self.total_counts[i, self._frame_index] = len(ix1) subids = ids[ix2] for j, x 
    def _conclude(self):
        self.dei_by_leaflet = []
        self.leaflets_summary = []
        for i in range(self.n_leaflets):
            timeseries = {}
            summary = {}
            res_counts = self.residue_counts[i]
            near_counts = self.near_counts[i]

            near_all = near_counts.sum(axis=0)
            total_all = res_counts.sum(axis=0)
            n_near_tot = near_all.sum()
            n_all_tot = total_all.sum()
            d, s = self._collate(near_all, near_all, total_all, total_all,
                                 n_near_tot, n_all_tot)
            timeseries['all'] = d
            summary['all'] = s
            for j, resname in enumerate(self.ids):
                near_species = near_counts[j]
                total_species = res_counts[j]
                d, s = self._collate(near_species, near_all,
                                     total_species, total_all,
                                     n_near_tot, n_all_tot)
                timeseries[resname] = d
                summary[resname] = s
            self.dei_by_leaflet.append(timeseries)
            self.leaflets_summary.append(summary)

    def _fit_gaussian(self, data, *args, **kwargs):
        """Treat each frame as an independent observation in a gaussian
        distribution.

        This appears to be the original method of [Corradi2018]_.

        .. note::

            The enrichment p-value is calculated from a two-tailed
            sample T-test, following [Corradi2018]_.
        """
        near = data['Near protein']
        frac = data['Fraction near protein']
        dei = data['Enrichment']
        summary = {
            'Mean near protein': near.mean(),
            'SD near protein': near.std(),
            'Mean fraction near protein': frac.mean(),
            'SD fraction near protein': frac.std(),
            'Mean enrichment': dei.mean(),
            'SD enrichment': dei.std()
        }
        return summary
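    # For example, per-frame enrichment values of [1.8, 2.2, 2.0] give a
    # 'Mean enrichment' of 2.0 and an 'SD enrichment' of ~0.16
    # (np.std defaults to the population SD, ddof=0).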
    def _fit_binomial(self, data: dict,
                      n_near_species: np.ndarray,
                      n_near: np.ndarray,
                      n_species: np.ndarray,
                      n_all: np.ndarray,
                      n_near_tot: int,
                      n_all_tot: int):
        """This function computes the following approximate probability
        distributions and derives statistics accordingly.

        * The number of lipids near the protein is represented as a
          normal distribution.
        * The fraction of lipids near the protein follows a
          hypergeometric distribution.
        * The enrichment is represented as the log-normal distribution
          derived from the ratio of two binomial convolutions of the
          frame-by-frame binomial distributions.

        All these approximations assume that each frame or observation
        is independent. The binomial approximation assumes that:

        * the number of the lipid species near the protein is small
          compared to the total number of that lipid species
        * the total number of all lipids is large
        * the fraction (n_species / n_all) is not close to 0 or 1.

        .. note::

            The enrichment p-value is calculated from the log-normal
            distribution of the null hypothesis: that the average
            enrichment is representative of the ratio of
            n_species : n_all.
        """
        summary = {"Total # lipids, all": n_all_tot,
                   "Total # lipids, shell": n_near_tot}
        summary['Total # species, shell'] = N = n_near_species.sum()
        summary['Total # species, all'] = N_sp = n_species.sum()
        if n_near_tot:  # catch zeros
            p_shell = N / n_near_tot
        else:
            p_shell = 0
        if n_all_tot:
            p_null = N_sp / n_all_tot
        else:
            p_null = 0

        # n events: assume normal
        summary['Mean # species, shell'] = n_near_species.mean()
        summary['SD # species, shell'] = sd = n_near_species.std()

        # actually hypergeometric, but binomials are easier
        # X ~ B(n_near_tot, p_shell)
        summary['Mean fraction of species, shell'] = p_shell
        summary['SD fraction of species, shell'] = sd_frac = sd / n_near.mean()

        if p_null == 0:
            summary['Mean enrichment'] = 1
            summary['SD enrichment'] = 0
        else:
            summary['Mean enrichment'] = p_shell / p_null
            summary['SD enrichment'] = sd_frac / p_null
        return summary

    def _collate(self, n_near_species: np.ndarray,
                 n_near: np.ndarray,
                 n_species: np.ndarray,
                 n_all: np.ndarray,
                 n_near_tot: int,
                 n_all_tot: int):
        data = {}
        data['Near protein'] = n_near_species
        frac = np.nan_to_num(n_near_species / n_near, nan=0.0)
        data['Fraction near protein'] = frac
        data['Total number'] = n_species
        n_total = np.nan_to_num(n_species / n_all, nan=0.0)
        # data['Fraction total'] = n_total
        n_total[n_total == 0] = np.nan
        dei = np.nan_to_num(frac / n_total, nan=0.0)
        data['Enrichment'] = dei
        summary = self._fit_distribution(data, n_near_species, n_near,
                                         n_species, n_all,
                                         n_near_tot, n_all_tot)
        return data, summary
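    # Worked example of the per-frame ratio computed in _collate: if 10
    # of 50 shell lipids are the species of interest (fraction 0.2) and
    # that species makes up 100 of the 1000 lipids in the leaflet
    # (fraction 0.1), the enrichment for that frame is 0.2 / 0.1 = 2.0;
    # values below 1 indicate depletion.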
    def summary_as_dataframe(self):
        """Convert the results summary into a pandas DataFrame.

        This requires pandas to be installed.
        """
        if not self.leaflets_summary:
            raise ValueError('Call run() first to get results')
        try:
            import pandas as pd
        except ImportError:
            raise ImportError('pandas is required to use this function '
                              'but is not installed. Please install with '
                              '`conda install pandas` or '
                              '`pip install pandas`.') from None

        dfs = [pd.DataFrame.from_dict(d, orient='index')
               for d in self.leaflets_summary]
        for i, df in enumerate(dfs, 1):
            df['Leaflet'] = i
        df = pd.concat(dfs)
        return df
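
# A sketch of inspecting the raw per-leaflet results after ``run()``;
# the "CHOL" key assumes a cholesterol species is present and that
# residues are grouped by resname:
#
#     lower = dei.dei_by_leaflet[0]            # leaflet with lowest z
#     per_frame_dei = lower["CHOL"]["Enrichment"]
#     mean_dei = dei.leaflets_summary[0]["CHOL"]["Mean enrichment"]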