"""High-level API for nomenclature."""
from __future__ import annotations
import logging
import subprocess
from collections.abc import Callable, Mapping
from functools import lru_cache
from typing import Any, TypeVar
import curies
import pandas as pd
import ssslm
from pystow.cache import Cached
from ssslm import LiteralMapping
from typing_extensions import Unpack
from .alts import get_primary_identifier
from .utils import _get_pi, get_version_from_kwargs
from ..constants import (
GetOntologyKwargs,
check_should_cache,
check_should_force,
)
from ..getters import NoBuildError, get_ontology
from ..identifier_utils import wrap_norm_prefix
from ..struct import Reference
from ..utils.cache import cached_collection, cached_df, cached_mapping
from ..utils.io import multidict
from ..utils.path import CacheArtifact, get_cache_path
__all__ = [
"get_definition",
"get_id_definition_mapping",
"get_id_name_mapping",
"get_id_synonyms_mapping",
"get_ids",
"get_literal_mappings",
"get_literal_mappings_df",
"get_name",
"get_name_by_curie",
"get_name_id_mapping",
"get_obsolete",
"get_obsolete_references",
"get_references",
"get_synonyms",
]
logger = logging.getLogger(__name__)
def get_name_by_curie(curie: str, **kwargs: Any) -> str | None:
    """Get the name for a CURIE, if possible.

    :param curie: A compact URI expression, e.g., ``doid:14330``
    :param kwargs: Keyword arguments forwarded to :func:`get_name`
    :returns: The name for the entity, if it can be looked up
    """
    return get_name(curie, **kwargs)
# Generic value type tying a mapping getter's value type to the
# return type of :func:`_help_get`.
X = TypeVar("X")
# Prefixes for which a lookup already failed (or returned nothing); used so
# that a warning is emitted at most once per prefix instead of on every call.
NO_BUILD_PREFIXES: set[str] = set()
# NOTE(review): appears unused in this module — possibly vestigial; confirm
# against the rest of the package before removing.
NO_BUILD_LOGGED: set = set()
def _help_get(
    f: Callable[[str, Unpack[GetOntologyKwargs]], Mapping[str, X]],
    reference: Reference,
    **kwargs: Unpack[GetOntologyKwargs],
) -> X | None:
    """Get the result for an entity based on a mapping maker function ``f``."""
    prefix = reference.prefix

    def _warn_once(template: str, *args: object) -> None:
        # Warn at most once per prefix, then remember it to avoid log spam.
        if prefix not in NO_BUILD_PREFIXES:
            logger.warning(template, *args)
            NO_BUILD_PREFIXES.add(prefix)

    try:
        mapping = f(prefix, **kwargs)  # type:ignore
    except NoBuildError:
        _warn_once("[%s] unable to look up results with %s", prefix, f)
        return None
    except ValueError as e:
        _warn_once("[%s] value error while looking up results with %s: %s", prefix, f, e)
        return None
    if not mapping:
        _warn_once("[%s] no results produced with %s", prefix, f)
        return None
    primary_id = get_primary_identifier(reference, **kwargs)
    return mapping.get(primary_id)
def get_name(
    prefix: str | curies.Reference | curies.ReferenceTuple,
    identifier: str | None = None,
    /,
    **kwargs: Unpack[GetOntologyKwargs],
) -> str | None:
    """Get the name for an entity.

    :param prefix: The prefix, or a full reference (in which case
        ``identifier`` should be omitted)
    :param identifier: The local unique identifier, required when ``prefix``
        is given as a string
    :param kwargs: Keyword arguments forwarded to the ontology getter
    :returns: The entity's name, if it can be looked up
    """
    reference = _get_pi(prefix, identifier)
    return _help_get(get_id_name_mapping, reference, **kwargs)
@lru_cache
@wrap_norm_prefix
def get_ids(prefix: str, **kwargs: Unpack[GetOntologyKwargs]) -> set[str]:
    """Get the set of local unique identifiers for this prefix.

    :param prefix: The prefix of the resource
    :param kwargs: Keyword arguments forwarded to the ontology getter
    :returns: The set of local unique identifiers for the prefix
    """
    if prefix == "ncbigene":
        # NCBI Gene is too large for the generic pipeline below,
        # so it has a dedicated loader.
        from ..sources.ncbi.ncbigene import get_ncbigene_ids

        logger.info("[%s] loading identifiers", prefix)
        rv = get_ncbigene_ids()
        logger.info("[%s] done loading identifiers", prefix)
        return rv
    # Only keep identifiers from this prefix — references from other
    # namespaces may appear in the ontology.
    return {
        reference.identifier
        for reference in get_references(prefix, **kwargs)
        if reference.prefix == prefix
    }
class CachedReferences(Cached[list[Reference]]):
    """Make a function lazily cache its return value as a file of CURIEs."""

    def load(self) -> list[Reference]:
        """Load data from the cache as a list of references.

        :returns: A list of references parsed from the cached CURIEs,
            one per line
        """
        # Explicit encoding so the cache round-trips identically across
        # platforms (the default encoding is platform-dependent).
        with open(self.path, encoding="utf-8") as file:
            return [Reference.from_curie(line.strip()) for line in file]

    def dump(self, references: list[Reference]) -> None:
        """Dump data to the cache as a list of CURIEs, one per line.

        :param references: The list of references to dump
        """
        with open(self.path, "w", encoding="utf-8") as file:
            for reference in references:
                print(reference.curie, file=file)
@wrap_norm_prefix
def get_references(prefix: str, **kwargs: Unpack[GetOntologyKwargs]) -> set[Reference]:
    """Get the set of references for this prefix.

    :param prefix: The prefix of the resource
    :param kwargs: Keyword arguments forwarded to the ontology getter
    :returns: The set of references, or an empty set if the resource
        could not be loaded
    """
    if prefix == "ncbigene":
        # NCBI Gene is too large for the generic pipeline below,
        # so it has a dedicated loader.
        from ..sources.ncbi.ncbigene import get_ncbigene_ids

        logger.info("[%s] loading identifiers", prefix)
        rv = {Reference(prefix="ncbigene", identifier=i) for i in get_ncbigene_ids()}
        logger.info("[%s] done loading identifiers", prefix)
        return rv

    version = get_version_from_kwargs(prefix, kwargs)
    # TODO pre-cache these!
    path = get_cache_path(prefix, CacheArtifact.references, version=version)

    @CachedReferences(
        path=path,
        force=check_should_force(kwargs),
        cache=check_should_cache(kwargs),
    )
    def _get_references() -> list[Reference]:
        ontology = get_ontology(prefix, **kwargs)
        return sorted(ontology.iterate_references())

    try:
        return set(_get_references())
    except NoBuildError:
        logger.debug("[%s] no build", prefix)
        return set()
    except Exception as e:
        # subprocess.CalledProcessError is a subclass of Exception, so the
        # original (Exception, CalledProcessError) tuple was redundant.
        logger.exception("[%s v%s] could not load: %s", prefix, version, e)
        return set()
@lru_cache
@wrap_norm_prefix
def get_id_name_mapping(
    prefix: str,
    **kwargs: Unpack[GetOntologyKwargs],
) -> Mapping[str, str]:
    """Get an identifier to name mapping for the OBO file.

    :param prefix: The prefix of the resource
    :param kwargs: Keyword arguments forwarded to the ontology getter
    :returns: A mapping from local unique identifiers to names, or an
        empty mapping if the resource could not be loaded
    """
    if prefix == "ncbigene":
        # NCBI Gene is too large for the generic pipeline below,
        # so it has a dedicated loader.
        from ..sources.ncbi.ncbigene import get_ncbigene_id_to_name_mapping

        logger.info("[%s] loading name mappings", prefix)
        rv = get_ncbigene_id_to_name_mapping()
        logger.info("[%s] done loading name mappings", prefix)
        return rv

    version = get_version_from_kwargs(prefix, kwargs)
    path = get_cache_path(prefix, CacheArtifact.names, version=version)

    @cached_mapping(
        path=path,
        header=[f"{prefix}_id", "name"],
        force=check_should_force(kwargs),
        cache=check_should_cache(kwargs),
    )
    def _get_id_name_mapping() -> Mapping[str, str]:
        ontology = get_ontology(prefix, **kwargs)
        return ontology.get_id_name_mapping()

    try:
        return _get_id_name_mapping()
    except NoBuildError:
        logger.debug("[%s] no build", prefix)
        return {}
    except Exception as e:
        # subprocess.CalledProcessError is a subclass of Exception, so the
        # original (Exception, CalledProcessError) tuple was redundant.
        logger.exception("[%s v%s] could not load: %s", prefix, version, e)
        return {}
@lru_cache
@wrap_norm_prefix
def get_name_id_mapping(
    prefix: str,
    **kwargs: Unpack[GetOntologyKwargs],
) -> Mapping[str, str]:
    """Get a name to identifier mapping for the OBO file.

    :param prefix: The prefix of the resource
    :param kwargs: Keyword arguments forwarded to the ontology getter
    :returns: A mapping from names to local unique identifiers. If several
        identifiers share a name, only one of them is kept.
    """
    id_name = get_id_name_mapping(prefix, **kwargs)
    return {v: k for k, v in id_name.items()}
def get_definition(
    prefix: str | curies.Reference | curies.ReferenceTuple,
    identifier: str | None = None,
    /,
    **kwargs: Unpack[GetOntologyKwargs],
) -> str | None:
    """Get the definition for an entity.

    :param prefix: The prefix, or a full reference (in which case
        ``identifier`` should be omitted)
    :param identifier: The local unique identifier, required when ``prefix``
        is given as a string
    :param kwargs: Keyword arguments forwarded to the ontology getter
    :returns: The entity's definition, if it can be looked up
    """
    reference = _get_pi(prefix, identifier)
    return _help_get(get_id_definition_mapping, reference, **kwargs)
def get_id_definition_mapping(
    prefix: str, **kwargs: Unpack[GetOntologyKwargs]
) -> Mapping[str, str]:
    """Get a mapping of descriptions.

    :param prefix: The prefix of the resource
    :param kwargs: Keyword arguments forwarded to the ontology getter
    :returns: A mapping from local unique identifiers to definitions
    """
    version = get_version_from_kwargs(prefix, kwargs)
    path = get_cache_path(prefix, CacheArtifact.definitions, version=version)

    @cached_mapping(
        path=path,
        header=[f"{prefix}_id", "definition"],
        force=check_should_force(kwargs),
        cache=check_should_cache(kwargs),
    )
    def _get_mapping() -> Mapping[str, str]:
        logger.info(
            "[%s v%s] no cached descriptions found. getting from OBO loader", prefix, version
        )
        ontology = get_ontology(prefix, **kwargs)
        return ontology.get_id_definition_mapping()

    # NOTE(review): unlike get_id_name_mapping/get_references, this does not
    # catch NoBuildError — callers going through _help_get handle it, but
    # direct callers will see the exception; confirm that is intended.
    return _get_mapping()
@wrap_norm_prefix
def get_obsolete(prefix: str, **kwargs: Unpack[GetOntologyKwargs]) -> set[str]:
    """Get the set of obsolete local unique identifiers.

    :param prefix: The prefix of the resource
    :param kwargs: Keyword arguments forwarded to the ontology getter
    :returns: The set of obsolete local unique identifiers
    """
    version = get_version_from_kwargs(prefix, kwargs)
    # TODO pre-cache these!
    path = get_cache_path(prefix, CacheArtifact.obsoletes, version=version)

    @cached_collection(
        path=path,
        force=check_should_force(kwargs),
        cache=check_should_cache(kwargs),
    )
    def _get_obsolete() -> list[str]:
        ontology = get_ontology(prefix, **kwargs)
        return sorted(ontology.get_obsolete())

    return set(_get_obsolete())
@wrap_norm_prefix
def get_obsolete_references(prefix: str, **kwargs: Unpack[GetOntologyKwargs]) -> set[Reference]:
    """Get the set of obsolete references for this prefix."""
    # Wrap each obsolete local unique identifier in a full reference.
    luids = get_obsolete(prefix, **kwargs)
    return {Reference(prefix=prefix, identifier=luid) for luid in luids}
def get_synonyms(
    prefix: str | curies.Reference | curies.ReferenceTuple,
    identifier: str | None = None,
    /,
    **kwargs: Unpack[GetOntologyKwargs],
) -> list[str] | None:
    """Get the synonyms for an entity.

    :param prefix: The prefix, or a full reference (in which case
        ``identifier`` should be omitted)
    :param identifier: The local unique identifier, required when ``prefix``
        is given as a string
    :param kwargs: Keyword arguments forwarded to the ontology getter
    :returns: The entity's synonyms, if they can be looked up
    """
    reference = _get_pi(prefix, identifier)
    return _help_get(get_id_synonyms_mapping, reference, **kwargs)
@wrap_norm_prefix
def get_id_synonyms_mapping(
    prefix: str, **kwargs: Unpack[GetOntologyKwargs]
) -> Mapping[str, list[str]]:
    """Get the OBO file and output a synonym dictionary.

    :param prefix: The prefix of the resource
    :param kwargs: Keyword arguments forwarded to the ontology getter
    :returns: A mapping from local unique identifiers to lists of synonym texts
    """
    df = get_literal_mappings_df(prefix=prefix, **kwargs)
    prefix_with_colon = f"{prefix}:"
    prefix_with_colon_len = len(prefix_with_colon)
    # keep only literal mappings with the right prefix
    df = df[df["curie"].str.startswith(prefix_with_colon)]
    # strip the prefix and colon from each CURIE to get the local
    # unique identifier, then group the synonym texts by identifier
    return multidict(
        (curie[prefix_with_colon_len:], text) for curie, text in df[["curie", "text"]].values
    )
def get_literal_mappings(
    prefix: str, *, skip_obsolete: bool = False, **kwargs: Unpack[GetOntologyKwargs]
) -> list[LiteralMapping]:
    """Get literal mappings.

    :param prefix: The prefix of the resource
    :param skip_obsolete: If true, filter out mappings whose reference is obsolete
    :param kwargs: Keyword arguments forwarded to the ontology getter
    :returns: A list of literal mappings
    """
    df = get_literal_mappings_df(prefix=prefix, **kwargs)
    rv = ssslm.df_to_literal_mappings(df, reference_cls=Reference)
    if skip_obsolete:
        obsoletes = get_obsolete_references(prefix, **kwargs)
        rv = [lm for lm in rv if lm.reference not in obsoletes]
    return rv
def get_literal_mappings_df(
    prefix: str,
    **kwargs: Unpack[GetOntologyKwargs],
) -> pd.DataFrame:
    """Get a literal mappings dataframe.

    :param prefix: The prefix of the resource
    :param kwargs: Keyword arguments forwarded to the ontology getter
    :returns: A dataframe of literal mappings with string-typed columns
    """
    version = get_version_from_kwargs(prefix, kwargs)
    path = get_cache_path(prefix, CacheArtifact.literal_mappings, version=version)

    @cached_df(
        path=path, dtype=str, force=check_should_force(kwargs), cache=check_should_cache(kwargs)
    )
    def _df_getter() -> pd.DataFrame:
        return get_ontology(prefix, **kwargs).get_literal_mappings_df()

    return _df_getter()