Source code for pyobo.api.xrefs

# -*- coding: utf-8 -*-

"""High-level API for xrefs."""

import logging
from functools import lru_cache
from typing import Mapping, Optional

import bioregistry
import pandas as pd
from tqdm.auto import tqdm
from tqdm.contrib.logging import logging_redirect_tqdm

from .utils import get_version
from ..constants import TARGET_ID, TARGET_PREFIX
from ..getters import get_ontology
from ..identifier_utils import wrap_norm_prefix
from ..utils.cache import cached_df, cached_mapping
from ..utils.path import prefix_cache_join

# Names exported as this module's public API.
__all__ = [
    "get_xrefs_df",
    "get_filtered_xrefs",
    "get_xref",
    "get_xrefs",
    "get_sssom_df",
]

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)


@wrap_norm_prefix
def get_xref(prefix: str, identifier: str, new_prefix: str, flip: bool = False) -> Optional[str]:
    """Look up the cross-reference of a single identifier in another namespace.

    :param prefix: The prefix of the ontology whose xrefs are consulted
    :param identifier: The local identifier to look up in the mapping
    :param new_prefix: The prefix of the namespace the xref should point to
    :param flip: Passed through to :func:`get_filtered_xrefs`; when true the
        mapping is inverted before the lookup
    :returns: The mapped identifier, or ``None`` if the mapping has no entry
        for ``identifier``
    """
    return get_filtered_xrefs(prefix, new_prefix, flip=flip).get(identifier)


@lru_cache()
@wrap_norm_prefix
def get_filtered_xrefs(
    prefix: str,
    xref_prefix: str,
    flip: bool = False,
    *,
    use_tqdm: bool = False,
    force: bool = False,
    strict: bool = False,
    version: Optional[str] = None,
) -> Mapping[str, str]:
    """Get the xrefs from one ontology that point to a single target prefix.

    The result is memoized in-process by :func:`functools.lru_cache`, so all
    arguments must be hashable and callers using the same arguments receive
    the same mapping object.

    :param prefix: The prefix of the ontology to get xrefs from
    :param xref_prefix: The prefix of the xref targets to keep
    :param flip: If true, return the inverted mapping (target identifier to
        source identifier)
    :param use_tqdm: Passed to the mapping cache and to the ontology's
        ``get_filtered_xrefs_mapping``
    :param force: Passed to the mapping cache and to :func:`get_ontology`
    :param strict: Passed to :func:`get_ontology`
    :param version: The ontology version; looked up with :func:`get_version`
        when not given
    :returns: A mapping from source identifiers to target identifiers (or the
        reverse when ``flip`` is set)
    """
    resolved_version = get_version(prefix) if version is None else version
    filtered_path = prefix_cache_join(
        prefix, "xrefs", name=f"{xref_prefix}.tsv", version=resolved_version
    )
    combined_path = prefix_cache_join(prefix, name="xrefs.tsv", version=resolved_version)
    source_column = f"{prefix}_id"
    columns = [source_column, f"{xref_prefix}_id"]

    @cached_mapping(path=filtered_path, header=columns, use_tqdm=use_tqdm, force=force)
    def _get_mapping() -> Mapping[str, str]:
        # Without a combined xrefs table on disk, fall back to the ontology itself.
        if not combined_path.is_file():
            logger.info("[%s] no cached xrefs found. getting from OBO loader", prefix)
            ontology = get_ontology(
                prefix, force=force, strict=strict, version=resolved_version
            )
            return ontology.get_filtered_xrefs_mapping(xref_prefix, use_tqdm=use_tqdm)

        logger.info("[%s] loading pre-cached xrefs", prefix)
        all_xrefs = pd.read_csv(combined_path, sep="\t", dtype=str)
        logger.info("[%s] filtering pre-cached xrefs", prefix)
        points_to_target = all_xrefs[TARGET_PREFIX] == xref_prefix
        pairs = all_xrefs.loc[points_to_target, [source_column, TARGET_ID]]
        return dict(pairs.values)

    mapping = _get_mapping()
    return {target: source for source, target in mapping.items()} if flip else mapping


# Alias: ``get_xrefs`` is bound to the same function object as ``get_filtered_xrefs``.
get_xrefs = get_filtered_xrefs


@wrap_norm_prefix
def get_xrefs_df(
    prefix: str,
    *,
    use_tqdm: bool = False,
    force: bool = False,
    strict: bool = False,
    version: Optional[str] = None,
) -> pd.DataFrame:
    """Get every xref of an ontology as a dataframe.

    The dataframe is cached as ``xrefs.tsv`` in the versioned cache location
    for ``prefix``; when it has to be built, it comes from the ontology's
    ``get_xrefs_df`` method.

    :param prefix: The prefix of the ontology to get xrefs from
    :param use_tqdm: Passed to the ontology's ``get_xrefs_df``
    :param force: Passed to the dataframe cache and to :func:`get_ontology`
    :param strict: Passed to :func:`get_ontology`
    :param version: The ontology version; looked up with :func:`get_version`
        when not given
    :returns: A dataframe of xrefs, read with ``dtype=str``
    """
    resolved_version = version if version is not None else get_version(prefix)
    cache_path = prefix_cache_join(prefix, name="xrefs.tsv", version=resolved_version)

    @cached_df(path=cache_path, dtype=str, force=force)
    def _df_getter() -> pd.DataFrame:
        logger.info("[%s] no cached xrefs found. getting from OBO loader", prefix)
        return get_ontology(
            prefix, force=force, strict=strict, version=resolved_version
        ).get_xrefs_df(use_tqdm=use_tqdm)

    return _df_getter()


@wrap_norm_prefix
def get_sssom_df(
    prefix: str,
    *,
    predicate_id: str = "oboinowl:hasDbXref",
    justification: str = "sempav:UnspecifiedMatching",
    **kwargs,
) -> pd.DataFrame:
    r"""Get xrefs from a source as an SSSOM dataframe.

    :param prefix: The ontology to look in for xrefs
    :param predicate_id: The predicate used in the SSSOM document. By default, ontologies
        don't typically ascribe semantics to xrefs so ``oboinowl:hasDbXref`` is used
    :param justification: The justification for the mapping. By default, ontologies
        don't typically ascribe semantics, so this is left with ``sempav:UnspecifiedMatching``
    :param kwargs: Keyword arguments forwarded to :func:`get_xrefs_df`
    :returns: An SSSOM-compliant dataframe of xrefs with the columns ``subject_id``,
        ``subject_label``, ``object_id``, ``object_label``, ``predicate_id``, and
        ``mapping_justification``. A label that cannot be looked up is given as an
        empty string in both label columns.

    For example, if you want to get UMLS as an SSSOM dataframe, you can do

    >>> import pyobo
    >>> df = pyobo.get_sssom_df("umls")
    >>> df.to_csv("umls.sssom.tsv", sep="\t", index=False)

    .. note:: This assumes the Bioregistry as the prefix map
    """
    # NOTE(review): the default ``sempav:`` prefix looks like a typo of ``semapv:``
    # (Semantic Mapping Vocabulary). It is left unchanged here so the default stays
    # backward-compatible -- confirm and correct separately.

    # Imported here rather than at module level, presumably to avoid a circular import.
    from .names import get_name

    df = get_xrefs_df(prefix=prefix, **kwargs)
    with logging_redirect_tqdm():
        rows = [
            (
                bioregistry.curie_to_str(prefix, source_id),
                get_name(prefix, source_id) or "",
                bioregistry.curie_to_str(target_prefix, target_id),
                # Same fallback as the subject label, so a missing name is always ""
                # instead of "" in one column and None in the other.
                get_name(target_prefix, target_id) or "",
                predicate_id,
                justification,
            )
            # The unpacking relies on the xrefs dataframe having exactly three columns,
            # in the order: source identifier, target prefix, target identifier.
            for source_id, target_prefix, target_id in tqdm(
                df.values, unit="mapping", unit_scale=True
            )
        ]
    return pd.DataFrame(
        rows,
        columns=[
            "subject_id",
            "subject_label",
            "object_id",
            "object_label",
            "predicate_id",
            "mapping_justification",
        ],
    )