Source code for omnipath._core.requests.interactions._utils

from typing import Any, Dict, Mapping, Optional

import pandas as pd

from omnipath.constants._constants import InteractionDataset
from omnipath._core.requests._utils import _ERROR_EMPTY_FMT
from omnipath._core.requests._intercell import Intercell
from omnipath._core.requests.interactions._interactions import (
    Datasets_t,
    AllInteractions,
)


def _to_dict(mapping: Optional[Mapping[Any, Any]]) -> Dict[Any, Any]:
    """
    Convert an optional mapping into a fresh :class:`dict`.

    Parameters
    ----------
    mapping
        Mapping to copy, or `None`.

    Returns
    -------
    :class:`dict`
        An empty dictionary if ``mapping`` is `None`, otherwise a new dictionary holding its items.
    """
    if mapping is None:
        return {}

    return dict(mapping)


def _swap_undirected(df: pd.DataFrame) -> pd.DataFrame:
    """
    Duplicate undirected interactions so that each of them is present in both orientations.

    Parameters
    ----------
    df
        Interactions containing a boolean ``'is_directed'`` column as well as the ``'source'`` and ``'target'``
        columns. Note that the ``'is_directed'`` column is removed from ``df`` in-place.

    Returns
    -------
    :class:`pandas.DataFrame`
        ``df`` itself (without ``'is_directed'``) if there are no undirected interactions. Otherwise a new
        dataframe with a fresh index containing, in this order, the directed interactions, the undirected
        interactions and the undirected interactions with their source and target columns swapped.

    Raises
    ------
    KeyError
        If ``'is_directed'`` is not among the columns of ``df``.
    """
    if "is_directed" not in df.columns:
        raise KeyError(f"Key `'is_directed'` not found in `{list(df.columns)}`.")

    # boolean mask; popping it also drops the column from the frame
    is_directed = df.pop("is_directed")

    undirected = df.loc[~is_directed, :]
    if undirected.empty:
        return df

    # `'source'`/`'target'` are always swapped, the optional pairs only if the source-side column is present
    swapped_names = {"source": "target", "target": "source"}
    optional_pairs = (
        ("source_genesymbol", "target_genesymbol"),
        ("ncbi_tax_id_source", "ncbi_tax_id_target"),
    )
    for source_col, target_col in optional_pairs:
        if source_col in undirected.columns:
            swapped_names[source_col] = target_col
            swapped_names[target_col] = source_col

    # swap by renaming and restoring the original column order: this keeps the dtypes intact
    # and raises a `KeyError` if only one column of a pair is present
    undirected_swapped = undirected.rename(columns=swapped_names)[list(undirected.columns)]

    return pd.concat(
        # select the directed *rows*; concatenating the boolean mask itself would lose them
        [df.loc[is_directed, :], undirected, undirected_swapped],
        axis=0,
        ignore_index=True,
    )


def _prepare_intercell(annotations: pd.DataFrame) -> pd.DataFrame:
    """Drop the intracellular annotations and prepare the remaining ones for merging with interactions."""
    intracell = ["intracellular_intercellular_related", "intracellular"]
    annotations = annotations.loc[~annotations["parent"].isin(intracell), :].copy()
    # presumably renamed so that it does not clash with the `'source'` column of the interactions
    annotations.rename(columns={"source": "category_source"}, inplace=True)
    # this makes it 3x as fast during groupby, since all of these are categories
    # it's mostly because groupby needs observed=True + using string object (numpy) vs "string"
    cols = ["category", "parent", "database"]
    annotations[cols] = annotations[cols].astype(str)

    return annotations


def _first_with_joined(df: pd.DataFrame, keys: list, column: str) -> pd.DataFrame:
    """Keep the first row of each ``keys`` group, with ``column`` holding the group's ``';'``-joined values."""
    # `transform` returns a result aligned with `df.index`, so every row (including the first one of
    # each group) carries the joined value of its own group; assigning an aggregated result to the
    # output of `nth` instead would align on unrelated index labels
    joined = df.groupby(keys)[column].transform(";".join).astype(str)
    df = df.assign(**{column: joined})

    return df.groupby(keys, as_index=False).nth(0).copy()  # much faster than `first`


def import_intercell_network(
    include: Datasets_t = (
        InteractionDataset.OMNIPATH,
        InteractionDataset.PATHWAY_EXTRA,
        InteractionDataset.KINASE_EXTRA,
        InteractionDataset.LIGREC_EXTRA,
    ),
    interactions_params: Optional[Mapping[str, Any]] = None,
    transmitter_params: Optional[Mapping[str, Any]] = None,
    receiver_params: Optional[Mapping[str, Any]] = None,
) -> pd.DataFrame:
    """
    Import intercellular network combining intercellular annotations and protein interactions.

    First, it imports a network of protein-protein interactions. Then, it retrieves annotations about the proteins
    intercellular communication roles, once for the transmitter (delivering information from the expressing cell)
    and second, the receiver (receiving signal and relaying it towards the expressing cell) side. These 3 queries
    can be customized by providing parameters which will be passed to
    :meth:`omnipath.interactions.AllInteractions.get` for the network and :meth:`omnipath.requests.Intercell.get`
    for the annotations.

    Finally, the 3 :class:`pandas.DataFrame` are combined in a way that the source proteins in each interaction are
    annotated by the transmitter, and the target proteins by the receiver categories. If undirected interactions
    are present (these are disabled by default), they will be duplicated, i.e. both partners can be both receiver
    and transmitter.

    Parameters
    ----------
    include
        Interaction datasets to include for :meth:`omnipath.interactions.AllInteractions.get`.
    interactions_params
        Parameters for the :meth:`omnipath.interactions.AllInteractions.get`.
    transmitter_params
        Parameters defining the transmitter side of intercellular connections, passed to
        :meth:`omnipath.requests.Intercell.get`. Unless specified, ``causality='trans'`` and ``scope='generic'``
        are used.
    receiver_params
        Parameters defining the receiver side of intercellular connections, passed to
        :meth:`omnipath.requests.Intercell.get`. Unless specified, ``causality='rec'`` and ``scope='generic'``
        are used.

    Returns
    -------
    :class:`pandas.DataFrame`
        A dataframe containing information about protein-protein interactions and the inter-cellular roles
        of the proteins involved in those interactions.

    Raises
    ------
    ValueError
        If the interactions, the transmitters or the receivers are empty, or if no rows are left after merging
        the interactions with the transmitters or with the receivers.
    """
    interactions_params = _to_dict(interactions_params)
    transmitter_params = _to_dict(transmitter_params)
    receiver_params = _to_dict(receiver_params)

    # TODO: this should be refactored as: QueryType.INTERCELL("scope").param, etc. (also in many other places)
    transmitter_params.setdefault("causality", "trans")
    transmitter_params.setdefault("scope", "generic")
    receiver_params.setdefault("causality", "rec")
    receiver_params.setdefault("scope", "generic")

    interactions = AllInteractions.get(include=include, **interactions_params)
    if interactions.empty:
        raise ValueError(_ERROR_EMPTY_FMT.format(obj="interactions"))
    interactions = _swap_undirected(interactions)

    transmitters = Intercell.get(**transmitter_params)
    if transmitters.empty:
        raise ValueError(_ERROR_EMPTY_FMT.format(obj="transmitters"))

    receivers = Intercell.get(**receiver_params)
    if receivers.empty:
        raise ValueError(_ERROR_EMPTY_FMT.format(obj="receivers"))

    transmitters = _prepare_intercell(transmitters)
    receivers = _prepare_intercell(receivers)

    # annotate the source side of each interaction by the transmitter categories
    res = pd.merge(
        interactions,
        transmitters,
        left_on="source",
        right_on="uniprot",
        how="inner",
    )
    if res.empty:
        raise ValueError("No values are left after merging interactions and transmitters.")

    # one row per (category, parent, source, target); the databases supporting it are joined by `';'`
    res = _first_with_joined(res, ["category", "parent", "source", "target"], "database")

    # annotate the target side of each interaction by the receiver categories
    res = pd.merge(
        res,
        receivers,
        how="inner",
        left_on="target",
        right_on="uniprot",
        suffixes=("_intercell_source", "_intercell_target"),
    )
    if res.empty:
        raise ValueError("No values are left after merging interactions and receivers.")

    res = _first_with_joined(
        res,
        [
            "category_intercell_source",
            "parent_intercell_source",
            "source",
            "target",
            "category_intercell_target",
            "parent_intercell_target",
        ],
        "database_intercell_target",
    )

    # retype back as categories
    for col in ["category", "parent"]:
        for suffix in ["_intercell_source", "_intercell_target"]:
            res[f"{col}{suffix}"] = res[f"{col}{suffix}"].astype("category")

    return res.reset_index(drop=True)