Source code for omnipath._core.requests._annotations

from typing import Any, Dict, Union, Mapping, Iterable, Optional
import logging

import pandas as pd

from omnipath._misc import dtypes
from omnipath._core.query import QueryType
from omnipath._core.utils._docs import d
from omnipath._core.requests._request import OmnipathRequestABC
from omnipath.constants._pkg_constants import Key, final

# Maximum number of proteins requested in a single query; `Annotations.get`
# slices longer protein lists into consecutive chunks of this size.
_MAX_N_PROTS = 600


@final
class Annotations(OmnipathRequestABC):
    """Request annotations from [OmniPath]_."""

    __string__ = frozenset({"source", "value"})
    __categorical__ = frozenset({"entity_type", "label", "source"})

    _query_type = QueryType.ANNOTATIONS

    def _modify_params(self, params: Dict[str, Any]) -> Dict[str, Any]:
        """Drop the organism key from the query parameters (modifies ``params`` in place)."""
        params.pop(Key.ORGANISM.value, None)

        return params

    @classmethod
    @d.dedent
    def params(cls) -> Dict[str, Any]:
        """%(query_params)s"""
        params = super().params()
        # Keep the advertised parameters in line with `_modify_params`,
        # which removes the organism key from every query.
        params.pop(Key.ORGANISM.value, None)

        return params

    @classmethod
    def get(
        cls,
        proteins: Optional[Union[str, Iterable[str]]] = None,
        resources: Optional[Union[str, Iterable[str]]] = None,
        force_full_download: bool = False,
        wide: bool = False,
        **kwargs,
    ) -> Union[pd.DataFrame, Dict[str, pd.DataFrame]]:
        """
        Import annotations from [OmniPath]_.

        Retrieves protein annotations about function, localization, expression, structure and other properties of
        proteins from `OmniPath <https://omnipathdb.org/annotations>`__.

        Parameters
        ----------
        proteins
            Genes or proteins for which annotations will be retrieved (UniProt IDs, HGNC Gene Symbols or miRBase IDs).

            In order to download annotations for protein complexes, write **'COMPLEX:'** before the gene symbols of
            the genes integrating the complex.

            If `None`, fetch annotations for all available genes or proteins.
        resources
            Load the annotations only from these databases. See :meth:`resources` for available options.
            If `None`, use all available resources.
        force_full_download
            Force the download of the entire annotations dataset. The full size of the data is ~1GB.
            We recommend to retrieve the annotations for a set of proteins or only from a few resources,
            depending on your interest.
        wide
            Pivot the annotations from a long to a wide dataframe format, reconstituting the format
            of the original resource.
        kwargs
            Additional query parameters.

        Returns
        -------
        :class:`pandas.DataFrame`
            A dataframe containing different molecule (protein, complex, gene, miRNA, small molecule) annotations.
            If `wide` is `True` and the result contains more than one resource, a `dict` of dataframes
            will be returned, one for each resource.

        Raises
        ------
        ValueError
            If neither `proteins` nor `resources` is given and `force_full_download` is `False`,
            or if `proteins` is an empty collection.

        Notes
        -----
        There might be also a few miRNAs and small molecules annotated. A vast majority of protein complex
        annotations are inferred from the annotations of the members: if all members carry the same annotation
        the complex inherits.

        Protein lists longer than the per-request limit are downloaded in several chunks which are
        concatenated before the optional conversion to the wide format.
        """
        if proteins is None and resources is None and not force_full_download:
            raise ValueError(
                "Please specify `force_full_download=True` in order to download the full dataset."
            )

        if resources is not None and not isinstance(resources, str):
            # Materialize once: the iterable is read for the log message and
            # again for every request, which would exhaust a one-shot iterator.
            resources = list(resources)

        res_info = (
            "all resources"
            if resources is None
            else f"the following resources: `{[resources] if isinstance(resources, str) else sorted(set(resources))}`"
        )
        inst = cls()

        if proteins is None:
            # Single request: the pivoting (if requested) is left to `_post_process`.
            inst._wide = wide
            logging.info(f"Downloading annotations for all proteins from {res_info}")

            return inst._get(proteins=None, resources=resources, **kwargs)

        if isinstance(proteins, str):
            proteins = (proteins,)
        proteins = sorted(set(proteins))

        if not proteins:
            # Fail early with a clear message instead of the opaque
            # "No objects to concatenate" raised by `pandas.concat`.
            raise ValueError("Expected at least one protein, found none.")

        logging.info(
            f"Downloading annotations for `{len(proteins)}` proteins "
            f"in chunks of `{_MAX_N_PROTS}` from {res_info}"
        )

        # Always fetch the chunks in the long format. Pivoting each chunk
        # separately can produce a `dict` per chunk (several resources), which
        # `pandas.concat` cannot combine, so the pivot is applied once below.
        # NOTE(review): assumes `_get` applies `_post_process` to its result - confirm in the base class.
        inst._wide = False
        chunks = [
            inst._get(
                proteins=proteins[start : start + _MAX_N_PROTS],
                resources=resources,
                **kwargs,
            )
            for start in range(0, len(proteins), _MAX_N_PROTS)
        ]
        annotations = pd.concat(chunks)

        return cls.pivot_annotations(annotations) if wide else annotations

    def _resource_filter(self, data: Mapping[str, Any], **_) -> bool:
        """Accept every resource entry; no filtering is applied."""
        return True

    def _post_process(self, df: pd.DataFrame, **kwargs) -> pd.DataFrame:
        """Convert ``df`` to the wide format if it was requested in :meth:`get`."""
        # `_wide` is only assigned in `get`; default to the long format otherwise.
        if getattr(self, "_wide", False):
            df = self.pivot_annotations(df)

        return df

    @classmethod
    def pivot_annotations(
        cls,
        df: pd.DataFrame,
    ) -> Union[pd.DataFrame, Dict[str, pd.DataFrame]]:
        """
        Convert annotations from the narrow to the wide format.

        Converts the annotations from a long to a wide dataframe format,
        reconstituting the format of the original resource.

        Parameters
        ----------
        df
            An annotation dataframe.

        Returns
        -------
        :class:`pandas.DataFrame` or `dict`
            A dataframe of various molecule (protein, complex, gene, miRNA, small molecule) annotations.
            If the data contains more than one resource, a `dict` of dataframes will be returned, one for each
            resource.
        """
        if df.source.nunique() > 1:
            # Each resource has its own set of labels, hence its own table.
            return {
                resource: cls.pivot_annotations(df[df.source == resource])
                for resource in df.source.unique()
            }

        index_cols = ["record_id", "uniprot", "genesymbol", "label"]

        if "entity_type" in df.label.values:
            # The resource provides its own `entity_type` label; drop the
            # column of the same name so that it does not clash after pivoting.
            df = df.drop("entity_type", axis=1)

        else:
            index_cols.append("entity_type")

        return dtypes.auto_dtype(
            df.drop("source", axis=1)
            .set_index(index_cols)
            .unstack("label")
            .droplevel(axis=1, level=0)
            .reset_index()
            .drop("record_id", axis=1)
        )


# `__all__` must contain names as strings; listing the class object itself
# makes `from <module> import *` fail with a `TypeError`.
__all__ = ["Annotations"]