# Source code for pyobo.identifier_utils

# -*- coding: utf-8 -*-

"""Utilities for handling prefixes."""

import logging
from functools import wraps
from typing import Optional, Tuple, Union

import bioregistry

from .registries import (
    curie_has_blacklisted_prefix,
    curie_has_blacklisted_suffix,
    curie_is_blacklisted,
    remap_full,
    remap_prefix,
)

# Public names exported by a star-import of this module.
__all__ = [
    "normalize_curie",
    "wrap_norm_prefix",
    "standardize_ec",
]

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)


class MissingPrefix(ValueError):
    """Raised on a missing prefix.

    :param prefix: The prefix that could not be handled
    :param curie: The CURIE in which the prefix was found
    :param xref: An optional xref; appended to the message when truthy
    :param ontology: An optional ontology name; prepended to the message when truthy

    The ``reference`` attribute starts out as ``None``. When it is set to an
    object with a ``curie`` attribute, that CURIE is appended to the message.
    """

    def __init__(self, prefix, curie, xref=None, ontology=None):
        # Forward the arguments to the base class so that ``args`` is populated.
        # Without this, ``args`` is empty when the exception is built with
        # keyword arguments, and copying/pickling it re-invokes the constructor
        # with no arguments and fails.
        super().__init__(prefix, curie, xref, ontology)
        self.prefix = prefix
        self.curie = curie
        self.xref = xref
        self.ontology = ontology
        self.reference = None

    def __str__(self) -> str:
        parts = []
        if self.ontology:
            parts.append(f"[{self.ontology}] ")
        parts.append(f"unhandled prefix {self.prefix} found in curie {self.curie}")
        if self.xref:
            parts.append(f"/xref {self.xref}")
        if self.reference is not None:
            parts.append(f" from {self.reference.curie}")
        return "".join(parts)


def _normalize_prefix(prefix: str, *, curie=None, xref=None, strict: bool = True) -> Optional[str]:
    """Normalize a prefix with the Bioregistry and return it, if possible.

    :param prefix: The prefix to normalize
    :param curie: The CURIE the prefix came from; only passed on to the exception
    :param xref: The xref the prefix came from; only passed on to the exception
    :param strict: Should an exception be raised when the prefix can not be normalized?
    :return: The normalized prefix, or None if it can not be normalized and not strict
    :raises MissingPrefix: If the prefix can not be normalized and ``strict`` is true
    """
    resolved = bioregistry.normalize_prefix(prefix)
    if resolved is None and strict:
        raise MissingPrefix(prefix=prefix, curie=curie, xref=xref)
    # Either a normalized prefix, or None in the non-strict failure case.
    return resolved


# CURIEs that could not be split on a colon; remembered so that
# normalize_curie logs each of them only once.
BAD_CURIES = set()


def normalize_curie(
    curie: str, *, strict: bool = True
) -> Union[Tuple[str, str], Tuple[None, None]]:
    """Parse a string that looks like a CURIE.

    :param curie: A compact uniform resource identifier (CURIE)
    :param strict: Should an exception be thrown if the CURIE can not be parsed w.r.t. the Bioregistry?
    :return: A ``(prefix, identifier)`` tuple, or ``(None, None)`` if the CURIE is
        blacklisted, has no colon, or (when not strict) has a prefix that can not be normalized
    :raises MissingPrefix: If ``strict`` is true and the prefix can not be normalized

    - Normalizes the namespace
    - Checks against a blacklist for the entire curie, for the namespace, and for suffixes.
    """
    # The blacklist checks run on the CURIE as given, before any remapping.
    if (
        curie_is_blacklisted(curie)
        or curie_has_blacklisted_prefix(curie)
        or curie_has_blacklisted_suffix(curie)
    ):
        return None, None

    # Remap the curie with the full list
    curie = remap_full(curie)

    # Remap node's prefix (if necessary)
    curie = remap_prefix(curie)

    head_ns, colon, identifier = curie.partition(":")
    if not colon:  # skip nodes that don't look like normal CURIEs
        if curie not in BAD_CURIES:
            # Remember the CURIE so the same failure is only logged once.
            BAD_CURIES.add(curie)
            logger.debug("could not split CURIE on colon: %s", curie)
        return None, None

    # Remove a redundant prefix. Compare a slice of the same length as the
    # prefix (plus colon) so the number of characters removed always matches
    # what was tested, even when casefolding changes the string length.
    redundant_len = len(head_ns) + 1
    if identifier[:redundant_len].casefold() == f"{head_ns}:".casefold():
        identifier = identifier[redundant_len:]

    norm_node_prefix = _normalize_prefix(head_ns, curie=curie, strict=strict)
    if not norm_node_prefix:
        return None, None
    return norm_node_prefix, identifier


def wrap_norm_prefix(f):
    """Decorate a function whose first argument is a prefix so the prefix is auto-normalized.

    The returned wrapper normalizes the prefix with the Bioregistry and calls
    ``f`` with the normalized prefix in place of the original one; all other
    positional and keyword arguments are passed through unchanged.

    :param f: A function taking a prefix as its first positional argument
    :return: The wrapped function
    :raises ValueError: Raised by the wrapper if the prefix can not be normalized
    """

    @wraps(f)
    def _with_normalized_prefix(prefix, *args, **kwargs):
        canonical = bioregistry.normalize_prefix(prefix)
        if canonical is not None:
            return f(canonical, *args, **kwargs)
        raise ValueError(f"Invalid prefix: {prefix}")

    return _with_normalized_prefix


def standardize_ec(ec: str) -> str:
    """Standardize an EC code identifier by removing all trailing dashes and dots.

    :param ec: An EC code, possibly with surrounding whitespace and trailing
        placeholder parts such as ``1.1.-.-``
    :return: The EC code with surrounding whitespace stripped and every
        trailing ``-`` and ``.`` removed
    """
    # A single rstrip over both characters removes any mix of trailing dashes
    # and dots, regardless of how many times they alternate.
    return ec.strip().rstrip("-.")