Source code for pyobo.getters

# -*- coding: utf-8 -*-

"""Utilities for OBO files."""

import datetime
import gzip
import json
import logging
import pathlib
import subprocess
import typing
import urllib.error
from collections import Counter
from pathlib import Path
from typing import (
    Callable,
    Iterable,
    List,
    Mapping,
    Optional,
    Sequence,
    Set,
    Tuple,
    TypeVar,
    Union,
)

import bioregistry
from tqdm.auto import tqdm

from .constants import DATABASE_DIRECTORY
from .identifier_utils import MissingPrefix, wrap_norm_prefix
from .plugins import has_nomenclature_plugin, run_nomenclature_plugin
from .struct import Obo
from .utils.io import get_writer
from .utils.path import ensure_path, prefix_directory_join
from .version import get_git_hash, get_version

__all__ = [
    "get_ontology",
    "NoBuild",
]

logger = logging.getLogger(__name__)


class NoBuild(RuntimeError):
    """Base exception for being unable to build."""


class UnhandledFormat(NoBuild):
    """Only OWL is available."""


#: The following prefixes cannot be loaded through ROBOT without
#: turning off integrity checks
REQUIRES_NO_ROBOT_CHECK = {"clo"}


@wrap_norm_prefix
def get_ontology(
    prefix: str,
    *,
    force: bool = False,
    rewrite: bool = False,
    strict: bool = True,
    version: Optional[str] = None,
    robot_check: bool = True,
) -> Obo:
    """Get the OBO for a given graph.

    :param prefix: The prefix of the ontology to look up
    :param version: The pre-looked-up version of the ontology
    :param force: Download the data again
    :param rewrite: Should the OBO cache be rewritten? Automatically set to true if
        ``force`` is true
    :param strict: Should CURIEs be treated strictly? If true, raises exceptions on
        invalid/malformed CURIEs
    :param robot_check: If set to false, will send the ``--check=false`` command to
        ROBOT to disregard malformed ontology components. Necessary to load some
        ontologies like VO.
    :returns: An OBO object
    :raises UnhandledFormat: If the resource is only available in a file format
        that can't be handled

    Alternate usage if you have a custom URL:

    >>> from pystow.utils import download
    >>> from pyobo import Obo, from_obo_path
    >>> url = ...
    >>> obo_path = ...
    >>> download(url=url, path=obo_path)
    >>> obo = from_obo_path(obo_path)
    """
    if force:
        rewrite = True
    if prefix == "uberon":
        logger.info("UBERON has so much garbage in it that defaulting to non-strict parsing")
        strict = False

    obonet_json_gz_path = prefix_directory_join(
        prefix, name=f"{prefix}.obonet.json.gz", ensure_exists=False, version=version
    )
    if obonet_json_gz_path.exists() and not force:
        from .reader import from_obonet
        from .utils.cache import get_gzipped_graph

        logger.debug("[%s] using obonet cache at %s", prefix, obonet_json_gz_path)
        return from_obonet(get_gzipped_graph(obonet_json_gz_path))

    if has_nomenclature_plugin(prefix):
        obo = run_nomenclature_plugin(prefix, version=version)
        logger.debug("[%s] caching nomenclature plugin", prefix)
        obo.write_default(force=rewrite)
        return obo

    logger.debug("[%s] no obonet cache found at %s", prefix, obonet_json_gz_path)

    ontology_format, path = _ensure_ontology_path(prefix, force=force, version=version)
    if path is None:
        raise NoBuild
    elif ontology_format == "obo":
        pass  # all gucci
    elif ontology_format == "owl":
        from bioontologies import robot

        _converted_obo_path = path.with_suffix(".obo")
        if prefix in REQUIRES_NO_ROBOT_CHECK:
            robot_check = False
        robot.convert(path, _converted_obo_path, check=robot_check)
        path = _converted_obo_path
    else:
        raise UnhandledFormat(f"[{prefix}] unhandled ontology file format: {path.suffix}")

    from .reader import from_obo_path

    obo = from_obo_path(path, prefix=prefix, strict=strict)
    if version is not None:
        if obo.data_version is None:
            logger.warning("[%s] did not have a version, overriding with %s", obo.ontology, version)
            obo.data_version = version
        elif obo.data_version != version:
            logger.warning(
                "[%s] had version %s, overriding with %s", obo.ontology, obo.data_version, version
            )
            obo.data_version = version
    obo.write_default(force=rewrite)
    return obo
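
# A hedged usage sketch (assumes network access and, for OWL-only resources, a
# working ROBOT installation): ``get_ontology`` is this module's main public
# entrypoint, and results are cached as gzipped obonet JSON so repeat calls
# skip the download and parse.
#
# >>> from pyobo.getters import get_ontology
# >>> chebi = get_ontology("chebi")  # doctest: +SKIP
# >>> chebi.data_version  # doctest: +SKIP
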
def _ensure_ontology_path(
    prefix: str, force, version
) -> Union[Tuple[str, Path], Tuple[None, None]]:
    for ontology_format, url in [  # noqa:B007
        ("obo", bioregistry.get_obo_download(prefix)),
        ("owl", bioregistry.get_owl_download(prefix)),
        ("json", bioregistry.get_json_download(prefix)),
    ]:
        if url is not None:
            try:
                path = Path(ensure_path(prefix, url=url, force=force, version=version))
            except urllib.error.HTTPError:
                continue
            else:
                return ontology_format, path
    return None, None


#: Obonet/Pronto can't parse these (consider converting to OBO with ROBOT?)
CANT_PARSE = {
    "agro",
    "aro",
    "bco",
    "caro",
    "cco",
    "chmo",
    "cido",
    "covoc",
    "cto",
    "cvdo",
    "dicom",
    "dinto",
    "emap",
    "epso",
    "eupath",
    "fbbi",
    "fma",
    "fobi",
    "foodon",
    "genepio",
    "hancestro",
    "hom",
    "hso",
    "htn",  # Unknown string format: creation: 16MAY2017
    "ico",
    "idocovid19",
    "labo",
    "mamo",
    "mfmo",
    "mfo",
    "mfomd",
    "miapa",
    "mo",
    "oae",
    "ogms",  # Unknown string format: creation: 16MAY2017
    "ohd",
    "ons",
    "oostt",
    "opmi",
    "ornaseq",
    "orth",
    "pdro",
    "probonto",
    "psdo",
    "reo",
    "rex",
    "rnao",
    "sepio",
    "sio",
    "spd",
    "sweetrealm",
    "txpo",
    "vido",
    "vt",
    "xl",
}
SKIP = {
    "ncbigene",  # too big, refs acquired from other dbs
    "pubchem.compound",  # too big, can't deal with this now
    "gaz",  # Gazetteer is irrelevant for biology
    "ma",  # yanked
    "bila",  # yanked
    # FIXME below
    "emapa",  # recently changed with EMAP... not sure what the difference is anymore
    "kegg.genes",
    "kegg.genome",
    "kegg.pathway",  # URL is wrong
    "ensemblglossary",  # Too much junk
    "biolink",
}

X = TypeVar("X")


def iter_helper(
    f: Callable[[str], Mapping[str, X]],
    leave: bool = False,
    strict: bool = True,
    **kwargs,
) -> Iterable[Tuple[str, str, X]]:
    """Yield all mappings extracted from each database given."""
    for prefix, mapping in iter_helper_helper(f, strict=strict, **kwargs):
        it = tqdm(
            mapping.items(),
            desc=f"iterating {prefix}",
            leave=leave,
            unit_scale=True,
            disable=None,
        )
        for key, value in it:
            value = value.strip('"').replace("\n", " ").replace("\t", " ").replace("  ", " ")
            if value:
                yield prefix, key, value


def _prefixes(
    skip_below: Optional[str] = None,
    skip_below_inclusive: bool = True,
    skip_pyobo: bool = False,
    skip_set: Optional[Set[str]] = None,
) -> Iterable[str]:
    for prefix, resource in sorted(bioregistry.read_registry().items()):
        if resource.no_own_terms:
            continue
        if prefix in SKIP:
            tqdm.write(f"skipping {prefix} because in default skip set")
            continue
        if skip_set and prefix in skip_set:
            tqdm.write(f"skipping {prefix} because in skip set")
            continue
        if skip_below is not None:
            if skip_below_inclusive:
                if prefix < skip_below:
                    continue
            else:
                if prefix <= skip_below:
                    continue
        has_pyobo = has_nomenclature_plugin(prefix)
        has_download = resource.has_download()
        if skip_pyobo and has_pyobo:
            continue
        if not has_pyobo and not has_download:
            continue
        yield prefix
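
# A minimal sketch of how ``iter_helper`` is meant to be driven: it flattens a
# per-prefix lookup (any ``Callable[[str], Mapping[str, X]]``) into a stream of
# (prefix, identifier, value) triples across the whole registry. PyOBO's
# ``get_id_name_mapping`` is used here as a plausible example of such a lookup.
#
# >>> from pyobo import get_id_name_mapping
# >>> triples = iter_helper(get_id_name_mapping)  # doctest: +SKIP
# >>> next(iter(triples))  # doctest: +SKIP
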
def iter_helper_helper(
    f: Callable[[str], X],
    use_tqdm: bool = True,
    skip_below: Optional[str] = None,
    skip_below_inclusive: bool = True,
    skip_pyobo: bool = False,
    skip_set: Optional[Set[str]] = None,
    strict: bool = True,
    **kwargs,
) -> Iterable[Tuple[str, X]]:
    """Yield all mappings extracted from each database given.

    :param f: A function that takes a prefix and gives back something that will be
        used by an outer function.
    :param use_tqdm: If true, use the tqdm progress bar
    :param skip_below: If given, skip sources whose names are lexicographically less
        than this (used for iterative curation)
    :param skip_below_inclusive: Should the source given by ``skip_below`` itself
        be included?
    :param skip_pyobo: If true, skip sources implemented in PyOBO
    :param skip_set: A pre-defined blacklist to skip
    :param strict: If true, will raise exceptions and crash the program instead of
        logging them.
    :param kwargs: Keyword arguments passed to ``f``.
    :yields: A prefix and the result of the callable ``f``
    :raises TypeError: If a type error is raised, it gets re-raised
    :raises urllib.error.HTTPError: If the resource could not be downloaded
    :raises urllib.error.URLError: If another problem was encountered during download
    :raises ValueError: If the data was not in the format that was expected (e.g., OWL)
    """
    prefixes = list(
        _prefixes(
            skip_set=skip_set,
            skip_below=skip_below,
            skip_pyobo=skip_pyobo,
            skip_below_inclusive=skip_below_inclusive,
        )
    )
    prefix_it = tqdm(
        prefixes, disable=not use_tqdm, desc=f"Building with {f.__name__}()", unit="resource"
    )
    for prefix in prefix_it:
        prefix_it.set_postfix(prefix=prefix)
        try:
            yv = f(prefix, **kwargs)  # type:ignore
        except urllib.error.HTTPError as e:
            logger.warning("[%s] HTTP %s: unable to download %s", prefix, e.getcode(), e.geturl())
            if strict and not bioregistry.is_deprecated(prefix):
                raise
        except urllib.error.URLError:
            logger.warning("[%s] unable to download", prefix)
            if strict and not bioregistry.is_deprecated(prefix):
                raise
        except MissingPrefix as e:
            logger.warning("[%s] missing prefix: %s", prefix, e)
            if strict and not bioregistry.is_deprecated(prefix):
                raise e
        except subprocess.CalledProcessError:
            logger.warning("[%s] ROBOT was unable to convert OWL to OBO", prefix)
        except UnhandledFormat as e:
            logger.warning("[%s] %s", prefix, e)
        except ValueError as e:
            if _is_xml(e):
                # this means that it tried doing parsing on an xml page
                logger.info(
                    "no resource available for %s. See http://www.obofoundry.org/ontology/%s",
                    prefix,
                    prefix,
                )
            else:
                logger.exception(
                    "[%s] got exception %s while parsing", prefix, e.__class__.__name__
                )
        except TypeError as e:
            logger.exception("[%s] got exception %s while parsing", prefix, e.__class__.__name__)
            if strict:
                raise e
        else:
            yield prefix, yv


def _is_xml(e) -> bool:
    return str(e).startswith("Tag-value pair parsing failed for:") or str(e).startswith(
        'Tag-value pair parsing failed for:\n<?xml version="1.0" encoding="UTF-8"?>'
    )


def _prep_dir(directory: Union[None, str, pathlib.Path]) -> pathlib.Path:
    if directory is None:
        rv = DATABASE_DIRECTORY
    elif isinstance(directory, str):
        rv = pathlib.Path(directory)
    elif isinstance(directory, pathlib.Path):
        rv = directory
    else:
        raise TypeError
    rv.mkdir(parents=True, exist_ok=True)
    return rv
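
# A minimal sketch of ``iter_helper_helper`` with a toy callable. The helper
# ``_has_obo_download`` below is hypothetical, defined only for illustration;
# with strict=False, per-resource failures are logged and skipped instead of
# aborting the whole run.
#
# >>> def _has_obo_download(prefix: str) -> bool:  # hypothetical helper
# ...     return bioregistry.get_obo_download(prefix) is not None
# >>> results = dict(iter_helper_helper(_has_obo_download, strict=False))  # doctest: +SKIP
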
""" directory = _prep_dir(directory) c: typing.Counter[str] = Counter() c_detailed: typing.Counter[Tuple[str, ...]] = Counter() if use_gzip: db_path = directory.joinpath(f"{db_name}.tsv.gz") else: db_path = directory.joinpath(f"{db_name}.tsv") db_sample_path = directory.joinpath(f"{db_name}_sample.tsv") db_summary_path = directory.joinpath(f"{db_name}_summary.tsv") db_summary_detailed_path = directory.joinpath(f"{db_name}_summary_detailed.tsv") logger.info("writing %s to %s", db_name, db_path) logger.info("writing %s sample to %s", db_name, db_sample_path) it = f(strict=strict, **kwargs) with gzip.open(db_path, mode="wt") if use_gzip else open(db_path, "w") as gzipped_file: writer = get_writer(gzipped_file) # for the first 10 rows, put it in a sample file too with open(db_sample_path, "w") as sample_file: sample_writer = get_writer(sample_file) # write header writer.writerow(columns) sample_writer.writerow(columns) for row, _ in zip(it, range(10)): c[row[0]] += 1 if summary_detailed is not None: c_detailed[tuple(row[i] for i in summary_detailed)] += 1 writer.writerow(row) sample_writer.writerow(row) # continue just in the gzipped one for row in it: c[row[0]] += 1 if summary_detailed is not None: c_detailed[tuple(row[i] for i in summary_detailed)] += 1 writer.writerow(row) logger.info(f"writing {db_name} summary to {db_summary_path}") with open(db_summary_path, "w") as file: writer = get_writer(file) writer.writerows(c.most_common()) if summary_detailed is not None: logger.info(f"writing {db_name} detailed summary to {db_summary_detailed_path}") with open(db_summary_detailed_path, "w") as file: writer = get_writer(file) writer.writerows((*keys, v) for keys, v in c_detailed.most_common()) db_metadata_path = directory.joinpath(f"{db_name}_metadata.json") with open(db_metadata_path, "w") as file: json.dump( { "version": get_version(), "git_hash": get_git_hash(), "date": datetime.datetime.now().strftime("%Y-%m-%d-%H-%M"), "count": sum(c.values()), }, file, indent=2, ) rv: List[pathlib.Path] = [ db_metadata_path, db_path, db_sample_path, db_summary_path, ] if summary_detailed: rv.append(db_summary_detailed_path) return rv