# -*- coding: utf-8 -*-

"""Use synonyms from OBO to normalize names."""

import logging
from abc import ABC, abstractmethod
from dataclasses import dataclass
from functools import lru_cache
from typing import Dict, Iterable, List, Mapping, Optional, Set, Tuple, Union

import bioregistry

from .api import names
from .utils.io import multisetdict

__all__ = [
    "ground",
    "Normalizer",
    "OboNormalizer",
    "MultiNormalizer",
    "NormalizationResult",
]

logger = logging.getLogger(__name__)

NormalizationSuccess = Tuple[str, str, str]
NormalizationFailure = Tuple[None, None, str]
NormalizationResult = Union[NormalizationSuccess, NormalizationFailure]
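# For example (values illustrative): a successful result is a triple like
# ("chebi", "15377", "water"), while a failure is (None, None, query), passing
# the original query string through unchanged.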


class Normalizer(ABC):
    """A normalizer."""

    id_to_name: Dict[str, str]
    id_to_synonyms: Dict[str, List[str]]

    #: A mapping from each synonym to the set of identifiers to which it points.
    #: In a perfect world, each set would contain exactly one identifier.
    synonym_to_identifiers_mapping: Dict[str, Set[str]]
    #: A mapping from normalized names to the actual names they came from
    norm_name_to_name: Dict[str, Set[str]]

    def __init__(
        self,
        id_to_name: Dict[str, str],
        id_to_synonyms: Dict[str, List[str]],
        remove_prefix: Optional[str] = None,
    ) -> None:
        """Initialize the normalizer.

        :param id_to_name: An identifier to name dictionary.
        :param id_to_synonyms: An identifier to list of synonyms dictionary.
        :param remove_prefix: A prefix to be removed from the identifiers. Useful for nomenclatures like ChEBI.
        """
        self.id_to_name = id_to_name
        self.id_to_synonyms = id_to_synonyms
        self.synonym_to_identifiers_mapping = multisetdict(
            self._iterate_synonyms_to_identifiers(
                id_to_name=self.id_to_name,
                id_to_synonyms=self.id_to_synonyms,
                remove_prefix=remove_prefix,
            )
        )
        self.norm_name_to_name = self._get_norm_name_to_names(self.synonym_to_identifiers_mapping)

    @classmethod
    def _get_norm_name_to_names(cls, synonyms: Iterable[str]) -> Dict[str, Set[str]]:
        return multisetdict((cls._normalize_text(synonym), synonym) for synonym in synonyms)
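
    # Note (assumption about the helper): multisetdict, from pyobo.utils.io,
    # groups (key, value) pairs into a dict of sets, so synonyms differing only
    # by case, dashes, or spaces collect under one normalized key. Hypothetical
    # sketch:
    #   _get_norm_name_to_names(["TGF-beta", "TGF beta"])
    #   -> {"tgfbeta": {"TGF-beta", "TGF beta"}}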

    @staticmethod
    def _normalize_text(text: str) -> str:
        text = text.strip().strip('"').strip("'").lower()
        text = normalize_dashes(text)
        text = text.replace("-", "")  # remove all dashes
        text = text.replace(" ", "")  # remove all spaces
        return text
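
    # Illustrative traces of _normalize_text (inputs are hypothetical): case,
    # surrounding quotes, dash variants, and whitespace are all collapsed, so
    # both of the following return "tgfbeta1":
    #   Normalizer._normalize_text('TGF-beta 1')
    #   Normalizer._normalize_text('"TGF–Beta 1"')  # en dash, quotes, mixed case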

    @staticmethod
    def _iterate_synonyms_to_identifiers(
        *,
        id_to_name: Mapping[str, str],
        id_to_synonyms: Mapping[str, Iterable[str]],
        remove_prefix: Optional[str] = None,
    ) -> Iterable[Tuple[str, str]]:
        if remove_prefix is not None:
            remove_prefix = f'{remove_prefix.lower().rstrip(":")}:'

        # Add name
        for identifier, name in id_to_name.items():
            if remove_prefix and identifier.lower().startswith(remove_prefix):
                identifier = identifier[len(remove_prefix) :]

            yield name, identifier

        # Add synonyms
        for identifier, synonyms in id_to_synonyms.items():
            if remove_prefix and identifier.lower().startswith(remove_prefix):
                identifier = identifier[len(remove_prefix) :]

            for synonym in synonyms:
                # it might overwrite but this is probably always due to alternate ids
                yield synonym, identifier
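
    # For example (hypothetical data): with remove_prefix="chebi", the prefix
    # is stripped case-insensitively, so the entry ("CHEBI:15377", "water")
    # yields the pair ("water", "15377"), and lookups use bare identifiers
    # rather than CURIEs.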

    def get_names(self, query: str) -> List[str]:
        """Get all names to which the query text maps."""
        norm_text = self._normalize_text(query)
        return list(self.norm_name_to_name.get(norm_text, []))

    @abstractmethod
    def normalize(self, query: str) -> NormalizationResult:
        """Try and normalize a name to a identifier and canonical name."""
        raise NotImplementedError


@lru_cache()
def get_normalizer(prefix: str) -> Normalizer:
    """Get an OBO normalizer."""
    norm_prefix = bioregistry.normalize_prefix(prefix)
    if norm_prefix is None:
        raise ValueError(f"unhandled prefix: {prefix}")
    logger.info("getting obo normalizer for %s", norm_prefix)
    normalizer = OboNormalizer(norm_prefix)
    logger.debug(
        "normalizer for %s with %s name lookups",
        normalizer.prefix,
        len(normalizer.norm_name_to_name),
    )
    return normalizer


def ground(prefix: Union[str, Iterable[str]], query: str) -> NormalizationResult:
    """Normalize a string given the prefix's labels and synonyms.

    :param prefix: If a string, grounds only against that namespace. If an
        iterable, tries grounding against each prefix in the given order.
    :param query: The string to try grounding.
    """
    if isinstance(prefix, str):
        normalizer = get_normalizer(prefix)
        return normalizer.normalize(query)
    for p in prefix:
        norm_prefix, identifier, name = ground(p, query)
        if norm_prefix and identifier and name:
            return norm_prefix, identifier, name
    return None, None, query
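

# Example usage of ground() (the prefixes, query, and exact results are
# illustrative; real output depends on the loaded vocabulary):
#
#     >>> from pyobo.normalizer import ground
#     >>> ground("chebi", "Water")
#     ('chebi', '15377', 'water')
#     >>> ground(["chebi", "mesh"], "not-a-real-term")
#     (None, None, 'not-a-real-term')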


class OboNormalizer(Normalizer):
    """A utility for normalizing by names."""

    def __init__(self, prefix: str) -> None:  # noqa: D107
        self.prefix = prefix
        self._len_prefix = len(prefix)
        id_to_name = names.get_id_name_mapping(prefix)
        id_to_synonyms = names.get_id_synonyms_mapping(prefix)
        super().__init__(
            id_to_name=dict(id_to_name),
            id_to_synonyms=dict(id_to_synonyms),
            remove_prefix=prefix,
        )

    def __repr__(self) -> str:  # noqa: D105
        return f'OboNormalizer(prefix="{self.prefix}")'

    def normalize(self, query: str) -> NormalizationResult:
        """Try to normalize a name to an identifier and canonical name."""
        names = self.get_names(query)
        if not names:
            return None, None, query

        for name in names:
            identifiers = self.synonym_to_identifiers_mapping[name]
            for identifier in identifiers:
                if identifier in self.id_to_name:
                    return self.prefix, identifier, self.id_to_name[identifier]
            logger.warning("could not find a valid identifier for %s from %s", name, identifiers)

        # It can happen that a name is matched but none of its identifiers resolve
        logger.warning("looked up name %s -> %s but could not find a fresh identifier", query, names)
        return None, None, query


@dataclass
class MultiNormalizer:
    """Multiple normalizers together.

    If you're looking for taxa of exotic plants, you might use:

    >>> from pyobo.normalizer import MultiNormalizer
    >>> normalizer = MultiNormalizer.from_prefixes(['ncbitaxon', 'itis'])
    >>> normalizer.normalize('Homo sapiens')
    ('ncbitaxon', '9606', 'Homo sapiens')
    >>> normalizer.normalize('Abies bifolia')  # variety not listed in NCBI
    ('itis', '507501', 'Abies bifolia')
    >>> normalizer.normalize('vulcan')  # nice try, nerds
    (None, None, 'vulcan')
    """

    #: The normalizers for each prefix
    normalizers: List[Normalizer]

    @staticmethod
    def from_prefixes(prefixes: List[str]) -> "MultiNormalizer":
        """Instantiate normalizers based on the given prefixes, in preferred order."""
        return MultiNormalizer([get_normalizer(prefix) for prefix in prefixes])

    def normalize(self, query: str) -> NormalizationResult:
        """Try to normalize a canonical name using multiple normalizers."""
        for normalizer in self.normalizers:
            prefix, identifier, name = normalizer.normalize(query)
            if prefix and identifier and name:  # all not empty
                return prefix, identifier, name
        return None, None, query


# See: https://en.wikipedia.org/wiki/Dash
FIGURE_DASH = "\u2012"
EN_DASH = "\u2013"
EM_DASH = "\u2014"
HORIZONTAL_BAR = "\u2015"
NORMAL_DASH = "-"


def normalize_dashes(s: str) -> str:
    """Normalize dashes in a string to the ASCII hyphen-minus."""
    return (
        s.replace(FIGURE_DASH, NORMAL_DASH)
        .replace(EN_DASH, NORMAL_DASH)
        .replace(EM_DASH, NORMAL_DASH)
        .replace(HORIZONTAL_BAR, NORMAL_DASH)
    )
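

# Example (illustrative): normalize_dashes maps figure, en, and em dashes and
# the horizontal bar to the ASCII hyphen-minus:
#     >>> normalize_dashes("TGF\u2013beta")  # en dash
#     'TGF-beta'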