Source code for pyobo.api.embedding

"""Embeddings for entities."""

from __future__ import annotations

import tempfile
from pathlib import Path
from typing import TYPE_CHECKING, Any, Literal, cast

import bioregistry
import curies
import numpy as np
import pandas as pd
from pystow import get_sentence_transformer
from tqdm import tqdm
from typing_extensions import Unpack

from pyobo.api.edges import get_edges_df
from pyobo.api.names import get_definition, get_id_name_mapping, get_name
from pyobo.api.utils import get_version_from_kwargs
from pyobo.constants import GetOntologyKwargs, check_should_force
from pyobo.identifier_utils import wrap_norm_prefix
from pyobo.utils.path import CacheArtifact, get_cache_path

if TYPE_CHECKING:
    import sentence_transformers

__all__ = [
    "get_graph_embeddings_df",
    "get_text_embedding",
    "get_text_embedding_similarity",
    "get_text_embeddings_df",
]


def _get_text(
    reference: str | curies.Reference | curies.ReferenceTuple,
    /,
    *,
    name: str | None = None,
    **kwargs: Unpack[GetOntologyKwargs],
) -> str | None:
    if name is None:
        name = get_name(reference, **kwargs)
    if name is None:
        return None
    description = get_definition(reference, **kwargs)
    if description:
        name += " " + description
    return name


[docs] def get_graph_embeddings_df( prefix: str, *, method: Literal["pykeen", "grape"] | None = None, epochs: int = 30, dimension: int = 32, **kwargs: Unpack[GetOntologyKwargs], ) -> pd.DataFrame: """Get graph machine learning embeddings.""" if method == "pykeen" or method is None: from pykeen.models import PairRE from pykeen.training import SLCWATrainingLoop from pykeen.triples import TriplesFactory from torch.optim import Adam triples_df = get_edges_df(prefix, **kwargs) training = TriplesFactory.from_labeled_triples(triples_df.values) model = PairRE(triples_factory=training, embedding_dim=dimension) optimizer = Adam(params=model.get_grad_params()) training_loop = SLCWATrainingLoop( model=model, triples_factory=training, optimizer=optimizer ) # can also set batch size here training_loop.train(triples_factory=training, num_epochs=epochs) embeddings = model.entity_representations[0]() df = pd.DataFrame( embeddings.detach().numpy(), index=[training.entity_id_to_label[i] for i in range(embeddings.shape[0])], ) elif method == "grape": from ensmallen import Graph edges_df = get_edges_df(prefix, **kwargs) with tempfile.TemporaryDirectory() as d: path = Path(d).joinpath("test.tsv") edges_df[[":START_ID", ":END_ID"]].to_csv(path, header=None, sep="\t", index=False) graph = Graph.from_csv( edge_path=str(path), edge_list_separator="\t", sources_column_number=0, destinations_column_number=1, edge_list_numeric_node_ids=False, directed=True, name=bioregistry.get_name(prefix, strict=True), verbose=True, ) graph = graph.remove_disconnected_nodes() from embiggen.embedders.ensmallen_embedders.second_order_line import ( SecondOrderLINEEnsmallen, ) embedding = SecondOrderLINEEnsmallen(embedding_size=dimension, epochs=epochs).fit_transform( graph ) df = embedding.get_all_node_embedding()[0].sort_index() # df.columns = [str(c) for c in df.columns] else: raise ValueError(f"invalid graph machine learning method: {method}") df.index.name = "curie" return df
EMBEDDING_INDEX_NAME = "luid" EMBEDDING_DIMENSIONALITY = 384
[docs] @wrap_norm_prefix def get_text_embeddings_df( prefix: str, *, model: sentence_transformers.SentenceTransformer | None = None, **kwargs: Unpack[GetOntologyKwargs], ) -> pd.DataFrame: """Get embeddings for all entities in the resource. :param prefix: A reference, either as a string or Reference object :param model: A sentence transformer model. Defaults to ``all-MiniLM-L6-v2`` if not given. :param kwargs: The keyword arguments to forward to ontology getter functions for names, definitions, and version :returns: A pandas dataframe with an index representing local unique identifiers and columns for the values of the model returned vectors """ path = get_cache_path( prefix, CacheArtifact.embeddings, version=get_version_from_kwargs(prefix, kwargs) ) if path.is_file() and not check_should_force(kwargs): # make an explicit dictionary so we make sure that the index column # doesn't also get interpreted as a float. This would silently be an # issue for any identifier space that has number-looking identifier patterns dtype: dict[str, Any] = {str(i): float for i in range(EMBEDDING_DIMENSIONALITY)} dtype[EMBEDDING_INDEX_NAME] = str df = pd.read_csv(path, sep="\t", dtype=dtype, index_col=0) if df.index.name != EMBEDDING_INDEX_NAME: df.index.name = EMBEDDING_INDEX_NAME df.index = df.index.astype(str) return df id_to_name = get_id_name_mapping(prefix, **kwargs) luids, texts = [], [] for identifier, name in tqdm(id_to_name.items(), desc=f"[{prefix}] constructing text"): text = _get_text(curies.ReferenceTuple(prefix, identifier), name=name, **kwargs) if text is None: continue luids.append(identifier) texts.append(text) if model is None: model = get_sentence_transformer() res = model.encode(texts, show_progress_bar=True) df = pd.DataFrame(res, index=luids) df.index.name = EMBEDDING_INDEX_NAME df.to_csv(path, sep="\t") # index is important here! return df
[docs] def get_text_embedding( reference: str | curies.Reference | curies.ReferenceTuple, *, model: sentence_transformers.SentenceTransformer | None = None, ) -> np.ndarray[tuple[int], np.dtype[np.float64]] | None: """Get a text embedding for an entity, or return none if no text is available. :param reference: A reference, either as a string or Reference object :param model: A sentence transformer model. Defaults to ``all-MiniLM-L6-v2`` if not given. :returns: A 1D numpy float array of embeddings from :class:`sentence_transformers` .. code-block:: python import pyobo embedding = pyobo.get_text_embedding("GO:0000001") # [-5.68335280e-02 7.96175096e-03 -3.36112119e-02 2.34440481e-03 ... ] If you want to do multiple operations, load up the model for reuse .. code-block:: python import pyobo from pyobo.api.embedding import get_text_embedding_model model = get_text_embedding_model() embedding = pyobo.get_text_embedding("GO:0000001", model=model) # [-5.68335280e-02 7.96175096e-03 -3.36112119e-02 2.34440481e-03 ... ] """ text = _get_text(reference) if text is None: return None if model is None: model = get_sentence_transformer() res = model.encode([text]) return cast(np.ndarray[tuple[int], np.dtype[np.float64]], res[0])
[docs] def get_text_embedding_similarity( reference_1: str | curies.Reference | curies.ReferenceTuple, reference_2: str | curies.Reference | curies.ReferenceTuple, *, model: sentence_transformers.SentenceTransformer | None = None, ) -> float | None: """Get the pairwise similarity. :param reference_1: A reference, given as a string or Reference object :param reference_2: A second reference :param model: A sentence transformer model. Defaults to ``all-MiniLM-L6-v2`` if not given. :returns: A floating point similarity, if text is available for both references, otherwise none .. code-block:: python import pyobo similarity = pyobo.get_text_embedding_similarity("GO:0000001", "GO:0000004") # 0.24702128767967224 If you want to do multiple operations, load up the model for reuse .. code-block:: python import pyobo from pyobo.api.embedding import get_text_embedding_model model = get_text_embedding_model() similarity = pyobo.get_text_embedding_similarity("GO:0000001", "GO:0000004", model=model) # 0.24702128767967224 """ if model is None: model = get_sentence_transformer() e1 = get_text_embedding(reference_1, model=model) e2 = get_text_embedding(reference_2, model=model) if e1 is None or e2 is None: return None return cast(float, model.similarity(e1, e2)[0][0].item())