# -*- coding: utf-8 -*-
"""Use synonyms from OBO to normalize names."""
import logging
from abc import ABC, abstractmethod
from dataclasses import dataclass
from functools import lru_cache
from typing import Dict, Iterable, List, Mapping, Optional, Set, Tuple, Union
import bioregistry
from .api import names
from .utils.io import multisetdict
__all__ = [
    "ground",
    "Normalizer",
    "OboNormalizer",
    "MultiNormalizer",
    "NormalizationResult",
]

logger = logging.getLogger(__name__)

# A successful grounding: (prefix, identifier, canonical name).
NormalizationSuccess = Tuple[str, str, str]
# A failed grounding: (None, None, the original query string).
NormalizationFailure = Tuple[None, None, str]
# Every normalization entry point returns one of the two shapes above.
NormalizationResult = Union[NormalizationSuccess, NormalizationFailure]
class Normalizer(ABC):
    """A normalizer."""

    id_to_name: Dict[str, str]
    id_to_synonyms: Dict[str, List[str]]

    #: A mapping from all synonyms to the set of identifiers that they point to.
    #: In a perfect world, each would only be a single element.
    synonym_to_identifiers_mapping: Dict[str, Set[str]]
    #: A mapping from normalized names to the actual ones that they came from
    norm_name_to_name: Dict[str, Set[str]]

    def __init__(
        self,
        id_to_name: Dict[str, str],
        id_to_synonyms: Dict[str, List[str]],
        remove_prefix: Optional[str] = None,
    ) -> None:  # noqa: D107
        """Initialize the normalizer.

        :param id_to_name: An identifier to name dictionary.
        :param id_to_synonyms: An identifier to list of synonyms dictionary.
        :param remove_prefix: A prefix to be removed from the identifiers. Useful for nomenclatures like ChEBI.
        """
        self.id_to_name = id_to_name
        self.id_to_synonyms = id_to_synonyms
        pairs = self._iterate_synonyms_to_identifiers(
            id_to_name=self.id_to_name,
            id_to_synonyms=self.id_to_synonyms,
            remove_prefix=remove_prefix,
        )
        self.synonym_to_identifiers_mapping = multisetdict(pairs)
        # Iterating the mapping yields its keys, i.e. all raw synonym strings.
        self.norm_name_to_name = self._get_norm_name_to_names(self.synonym_to_identifiers_mapping)

    @classmethod
    def _get_norm_name_to_names(cls, synonyms: Iterable[str]) -> Dict[str, Set[str]]:
        # Group the raw synonym strings by their normalized form.
        return multisetdict((cls._normalize_text(raw), raw) for raw in synonyms)

    @staticmethod
    def _normalize_text(text: str) -> str:
        # Lowercase, drop surrounding whitespace and quotes, unify exotic dashes,
        # then remove dashes and spaces so lookups are punctuation-insensitive.
        cleaned = text.strip().strip('"').strip("'").lower()
        cleaned = normalize_dashes(cleaned)
        return cleaned.replace("-", "").replace(" ", "")

    @staticmethod
    def _iterate_synonyms_to_identifiers(
        *,
        id_to_name: Mapping[str, str],
        id_to_synonyms: Mapping[str, Iterable[str]],
        remove_prefix: Optional[str] = None,
    ) -> Iterable[Tuple[str, str]]:
        prefix_marker: Optional[str] = None
        if remove_prefix is not None:
            prefix_marker = f'{remove_prefix.lower().rstrip(":")}:'

        def _strip_prefix(curie: str) -> str:
            # Remove a redundant CURIE prefix (e.g., ``chebi:``), case-insensitively.
            if prefix_marker and curie.lower().startswith(prefix_marker):
                return curie[len(prefix_marker):]
            return curie

        # Primary labels first
        for curie, label in id_to_name.items():
            yield label, _strip_prefix(curie)
        # Then synonyms; repeated keys are probably always due to alternate ids,
        # and the consumer aggregates the values into sets anyway.
        for curie, synonyms in id_to_synonyms.items():
            local_unique_id = _strip_prefix(curie)
            for synonym in synonyms:
                yield synonym, local_unique_id

    def get_names(self, query: str) -> List[str]:
        """Get all names to which the query text maps."""
        return list(self.norm_name_to_name.get(self._normalize_text(query), []))

    @abstractmethod
    def normalize(self, query: str) -> NormalizationResult:
        """Try and normalize a name to a identifier and canonical name."""
        raise NotImplementedError
@lru_cache()
def get_normalizer(prefix: str) -> Normalizer:
    """Get an OBO normalizer.

    :param prefix: A (possibly non-normalized) registry prefix.
    :raises ValueError: If the prefix can not be normalized by the Bioregistry.
    """
    norm_prefix = bioregistry.normalize_prefix(prefix)
    if norm_prefix is None:
        raise ValueError(f"unhandled prefix: {prefix}")
    logger.info("getting obo normalizer for %s", norm_prefix)
    # Building the normalizer loads all names/synonyms; cached per prefix above.
    rv = OboNormalizer(norm_prefix)
    logger.debug("normalizer for %s with %s name lookups", rv.prefix, len(rv.norm_name_to_name))
    return rv
def ground(prefix: Union[str, Iterable[str]], query: str) -> NormalizationResult:
    """Normalize a string given the prefix's labels and synonyms.

    :param prefix: If a string, only grounds against that namespace. If an iterable,
        will try grounding against each prefix in that order.
    :param query: The string to try grounding
    :return: A (prefix, identifier, name) triple on success, or (None, None, query)
        if no namespace could ground the query.
    """
    if isinstance(prefix, str):
        normalizer = get_normalizer(prefix)
        return normalizer.normalize(query)
    # Try each namespace in the given order; return the first full match.
    for p in prefix:
        norm_prefix, identifier, name = ground(p, query)
        if norm_prefix and identifier and name:
            return norm_prefix, identifier, name
    return None, None, query
class OboNormalizer(Normalizer):
    """A utility for normalizing by names."""

    def __init__(self, prefix: str) -> None:  # noqa: D107
        """Initialize the normalizer by loading names and synonyms for the prefix.

        :param prefix: The normalized prefix of the ontology whose labels and
            synonyms should be used for grounding.
        """
        self.prefix = prefix
        self._len_prefix = len(prefix)
        id_to_name = names.get_id_name_mapping(prefix)
        id_to_synonyms = names.get_id_synonyms_mapping(prefix)
        super().__init__(
            id_to_name=dict(id_to_name),
            id_to_synonyms=dict(id_to_synonyms),
            remove_prefix=prefix,
        )

    def __repr__(self) -> str:  # noqa: D105
        return f'OboNormalizer(prefix="{self.prefix}")'

    def normalize(self, query: str) -> NormalizationResult:
        """Try and normalize a name to an identifier and canonical name.

        :param query: The string to try grounding.
        :return: (prefix, identifier, canonical name) on success,
            or (None, None, query) on failure.
        """
        # Renamed from ``names`` to avoid shadowing the module-level ``names`` import.
        matched_names = self.get_names(query)
        if not matched_names:
            return None, None, query
        for name in matched_names:
            identifiers = self.synonym_to_identifiers_mapping[name]
            for identifier in identifiers:
                if identifier in self.id_to_name:
                    return self.prefix, identifier, self.id_to_name[identifier]
            # Lazy %-style args avoid formatting when the level is disabled.
            logger.warning("Could not find valid identifier for %s from %s", name, identifiers)
        # maybe it happens that one can't be found?
        logger.warning("was able to look up name %s->%s but not find fresh identifier", query, matched_names)
        return None, None, query
@dataclass
class MultiNormalizer:
    """Multiple normalizers together.

    If you're looking for taxa of exotic plants, you might use:

    >>> from pyobo.normalizer import MultiNormalizer
    >>> normalizer = MultiNormalizer.from_prefixes(['ncbitaxon', 'itis'])
    >>> normalizer.normalize('Homo sapiens')
    ('ncbitaxon', '9606', 'Homo sapiens')
    >>> normalizer.normalize('Abies bifolia')  # variety not listed in NCBI
    ('itis', '507501', 'Abies bifolia')
    >>> normalizer.normalize('vulcan')  # nice try, nerds
    (None, None, 'vulcan')
    """

    #: The normalizers for each prefix, in preferred order
    normalizers: List["Normalizer"]

    @classmethod
    def from_prefixes(cls, prefixes: List[str]) -> "MultiNormalizer":
        """Instantiate normalizers based on the given prefixes, in preferred order."""
        return cls([get_normalizer(prefix) for prefix in prefixes])

    def normalize(self, query: str) -> "NormalizationResult":
        """Try and normalize a canonical name using multiple normalizers.

        :param query: The string to try grounding.
        :return: The first successful (prefix, identifier, name) triple,
            or (None, None, query) if every normalizer fails.
        """
        for normalizer in self.normalizers:
            prefix, identifier, name = normalizer.normalize(query)
            if prefix and identifier and name:  # all not empty
                return prefix, identifier, name
        return None, None, query
# See: https://en.wikipedia.org/wiki/Dash
# Direct escapes instead of the previous ``b"...".decode("utf-8")`` round-trips.
FIGURE_DASH = "\u2012"
EN_DASH = "\u2013"
EM_DASH = "\u2014"
HORIZONAL_BAR = "\u2015"  # (sic) misspelled name kept for backward compatibility
NORMAL_DASH = "-"

# Precompiled table so normalization is a single C-level pass instead of
# four chained str.replace() calls.
_DASH_TRANSLATION = str.maketrans({
    FIGURE_DASH: NORMAL_DASH,
    EN_DASH: NORMAL_DASH,
    EM_DASH: NORMAL_DASH,
    HORIZONAL_BAR: NORMAL_DASH,
})


def normalize_dashes(s: str) -> str:
    """Normalize dashes in a string.

    :param s: Text that may contain figure/en/em dashes or a horizontal bar.
    :return: The same text with each of those characters replaced by ``-``.
    """
    return s.translate(_DASH_TRANSLATION)