Source code for pyobo.api.xrefs

# -*- coding: utf-8 -*-

"""High-level API for cross-references (xrefs)."""

import logging
from functools import lru_cache
from typing import Mapping, Optional

import bioregistry
import pandas as pd
from tqdm.auto import tqdm
from tqdm.contrib.logging import logging_redirect_tqdm

from .utils import get_version
from ..constants import TARGET_ID, TARGET_PREFIX
from ..getters import get_ontology
from ..identifier_utils import wrap_norm_prefix
from ..utils.cache import cached_df, cached_mapping
from ..utils.path import prefix_cache_join

__all__ = [
    "get_xrefs_df",
    "get_filtered_xrefs",
    "get_xref",
    "get_xrefs",
    "get_sssom_df",
]

logger = logging.getLogger(__name__)


@wrap_norm_prefix
def get_xref(
    prefix: str,
    identifier: str,
    new_prefix: str,
    flip: bool = False,
) -> Optional[str]:
    """Look up the cross-reference of one identifier in another namespace.

    :param prefix: The prefix of the resource whose xrefs are consulted
    :param identifier: The key to look up in the filtered xref mapping
    :param new_prefix: The prefix of the namespace the xref should point to
    :param flip: Passed through to :func:`get_filtered_xrefs`
    :returns: The mapped identifier, or ``None`` when the mapping has no
        entry for ``identifier``
    """
    return get_filtered_xrefs(prefix, new_prefix, flip=flip).get(identifier)
@lru_cache()
@wrap_norm_prefix
def get_filtered_xrefs(
    prefix: str,
    xref_prefix: str,
    flip: bool = False,
    *,
    use_tqdm: bool = False,
    force: bool = False,
    strict: bool = False,
    version: Optional[str] = None,
) -> Mapping[str, str]:
    """Build the mapping from identifiers in ``prefix`` to xrefs in ``xref_prefix``.

    :param prefix: The prefix of the resource whose xrefs are consulted
    :param xref_prefix: Only xrefs whose target prefix equals this are kept
    :param flip: If true, return the inverted mapping (xref -> source identifier)
    :param use_tqdm: Forwarded to the cache decorator and the ontology loader
    :param force: Forwarded to the cache decorator and to ``get_ontology``
    :param strict: Forwarded to ``get_ontology``
    :param version: The resource version; looked up with ``get_version`` when
        not given
    :returns: A dictionary from source identifier to target identifier
        (or the reverse when ``flip`` is set)
    """
    resolved_version = get_version(prefix) if version is None else version
    source_column = f"{prefix}_id"

    filtered_path = prefix_cache_join(
        prefix, "xrefs", name=f"{xref_prefix}.tsv", version=resolved_version
    )
    unfiltered_path = prefix_cache_join(prefix, name="xrefs.tsv", version=resolved_version)

    @cached_mapping(
        path=filtered_path,
        header=[source_column, f"{xref_prefix}_id"],
        use_tqdm=use_tqdm,
        force=force,
    )
    def _get_mapping() -> Mapping[str, str]:
        # Without a table of all xrefs on disk, fall back to the ontology itself.
        if not unfiltered_path.is_file():
            logger.info("[%s] no cached xrefs found. getting from OBO loader", prefix)
            ontology = get_ontology(
                prefix, force=force, strict=strict, version=resolved_version
            )
            return ontology.get_filtered_xrefs_mapping(xref_prefix, use_tqdm=use_tqdm)

        # Otherwise reuse the full xrefs table and keep only the requested target.
        logger.info("[%s] loading pre-cached xrefs", prefix)
        all_xrefs = pd.read_csv(unfiltered_path, sep="\t", dtype=str)
        logger.info("[%s] filtering pre-cached xrefs", prefix)
        matches_target = all_xrefs[TARGET_PREFIX] == xref_prefix
        selected = all_xrefs.loc[matches_target, [source_column, TARGET_ID]]
        return dict(selected.values)

    forward = _get_mapping()
    if not flip:
        return forward
    return {target: source for source, target in forward.items()}
# Public alias: ``get_xrefs`` is bound to the very same function object as
# ``get_filtered_xrefs`` (both names are exported through ``__all__``).
get_xrefs = get_filtered_xrefs
@wrap_norm_prefix
def get_xrefs_df(
    prefix: str,
    *,
    use_tqdm: bool = False,
    force: bool = False,
    strict: bool = False,
    version: Optional[str] = None,
) -> pd.DataFrame:
    """Return every xref of a resource as a dataframe.

    The result goes through ``cached_df``, keyed on an ``xrefs.tsv`` path
    inside the cache location of ``prefix`` for the resolved version.

    :param prefix: The prefix of the resource whose xrefs are wanted
    :param use_tqdm: Forwarded to the ontology's ``get_xrefs_df``
    :param force: Forwarded to the cache decorator and to ``get_ontology``
    :param strict: Forwarded to ``get_ontology``
    :param version: The resource version; looked up with ``get_version`` when
        not given
    :returns: The dataframe produced by the ontology's ``get_xrefs_df``
        (or its cached copy)
    """
    resolved_version = version if version is not None else get_version(prefix)
    cache_path = prefix_cache_join(prefix, name="xrefs.tsv", version=resolved_version)

    @cached_df(path=cache_path, dtype=str, force=force)
    def _df_getter() -> pd.DataFrame:
        logger.info("[%s] no cached xrefs found. getting from OBO loader", prefix)
        return get_ontology(
            prefix, force=force, strict=strict, version=resolved_version
        ).get_xrefs_df(use_tqdm=use_tqdm)

    return _df_getter()
@wrap_norm_prefix
def get_sssom_df(
    prefix: str,
    *,
    predicate_id: str = "oboinowl:hasDbXref",
    justification: str = "sempav:UnspecifiedMatching",
    **kwargs,
) -> pd.DataFrame:
    r"""Get xrefs from a source as an SSSOM dataframe.

    :param prefix: The ontology to look in for xrefs
    :param predicate_id: The predicate used in the SSSOM document. By default, ontologies
        don't typically ascribe semantics to xrefs so ``oboinowl:hasDbXref`` is used
    :param justification: The justification for the mapping. By default, ontologies
        don't typically ascribe semantics, so this is left with
        ``sempav:UnspecifiedMatching``
    :param kwargs: Additional keyword arguments forwarded to :func:`get_xrefs_df`
    :returns: A SSSOM-compliant dataframe of xrefs with the columns ``subject_id``,
        ``subject_label``, ``object_id``, ``object_label``, ``predicate_id``, and
        ``mapping_justification``. Labels that cannot be looked up are given as
        the empty string in both label columns.

    For example, if you want to get UMLS as an SSSOM dataframe, you can do

    >>> import pyobo
    >>> df = pyobo.get_sssom_df("umls")
    >>> df.to_csv("umls.sssom.tsv", sep="\t", index=False)

    .. note:: This assumes the Bioregistry as the prefix map
    """
    # NOTE(review): the SSSOM justification vocabulary is usually abbreviated
    # ``semapv``; the ``sempav`` default above looks like a typo but is kept
    # unchanged for backward compatibility -- confirm before changing.

    # Imported here rather than at module level, as in the original code.
    from .names import get_name

    df = get_xrefs_df(prefix=prefix, **kwargs)
    # Keep log records from interleaving with the progress bar while names are resolved.
    with logging_redirect_tqdm():
        rows = [
            (
                bioregistry.curie_to_str(prefix, source_id),
                get_name(prefix, source_id) or "",
                bioregistry.curie_to_str(target_prefix, target_id),
                # Use the same empty-string fallback as the subject label so that
                # both label columns treat missing names identically.
                get_name(target_prefix, target_id) or "",
                predicate_id,
                justification,
            )
            for source_id, target_prefix, target_id in tqdm(
                df.values, unit="mapping", unit_scale=True
            )
        ]
    return pd.DataFrame(
        rows,
        columns=[
            "subject_id",
            "subject_label",
            "object_id",
            "object_label",
            "predicate_id",
            "mapping_justification",
        ],
    )