Source code for dstk.modules.geometric_distance

"""
This module provides functions to compute geometric distance and similarity measures between word embeddings, enabling semantic comparison of words in vector space.

Available metrics include:

* Euclidean distance
* Manhattan distance
* Cosine similarity

Additionally, it offers methods to find the nearest semantic neighbors of a given word based on a specified distance or similarity metric, using scikit-learn's NearestNeighbors for exact search and FAISS indexes for approximate search.

All functions operate on word embeddings represented as Pandas DataFrames indexed by words, facilitating easy integration with common NLP and computational linguistics workflows.
"""

import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity
from faiss import IndexHNSWFlat, IndexIVFFlat, IndexFlatL2

from ..lib_types import ndarray, Series, DataFrame, Neighbors, Neighbor

def euclidean_distance(embeddings: DataFrame, first_word: str, second_word: str) -> float:
    """
    Computes the Euclidean distance between the embeddings of two words.

    :param embeddings: A dataframe containing the word embeddings.
    :type embeddings: DataFrame
    :param first_word: The first word in the pair.
    :type first_word: str
    :param second_word: The second word in the pair.
    :type second_word: str
    :returns: The Euclidean distance between the first and second word.
    :rtype: float
    """
    first_word_vector: Series = embeddings.loc[first_word]
    second_word_vector: Series = embeddings.loc[second_word]

    return float(np.linalg.norm(first_word_vector - second_word_vector))
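
A minimal usage sketch with made-up 2-dimensional vectors, assuming the module is importable under the path shown above; for unit vectors along different axes the Euclidean distance is sqrt(2) ≈ 1.4142::

    import pandas as pd
    from dstk.modules.geometric_distance import euclidean_distance

    toy = pd.DataFrame([[1.0, 0.0], [0.0, 1.0]], index=["cat", "bolt"])
    print(euclidean_distance(toy, "cat", "bolt"))  # 1.4142135623730951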
def manhattan_distance(embeddings: DataFrame, first_word: str, second_word: str) -> float:
    """
    Computes the Manhattan distance between the embeddings of two words.

    :param embeddings: A dataframe containing the word embeddings.
    :type embeddings: DataFrame
    :param first_word: The first word in the pair.
    :type first_word: str
    :param second_word: The second word in the pair.
    :type second_word: str
    :returns: The Manhattan distance between the first and second word.
    :rtype: float
    """
    first_word_vector: Series = embeddings.loc[first_word]
    second_word_vector: Series = embeddings.loc[second_word]

    # Cast the numpy scalar to a plain float to match the annotated return type.
    return float(np.sum(np.abs(first_word_vector - second_word_vector)))
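
The same toy setup (hypothetical data) for the Manhattan distance, which sums the absolute coordinate differences; here |1 - 0| + |0 - 1| = 2::

    import pandas as pd
    from dstk.modules.geometric_distance import manhattan_distance

    toy = pd.DataFrame([[1.0, 0.0], [0.0, 1.0]], index=["cat", "bolt"])
    print(manhattan_distance(toy, "cat", "bolt"))  # 2.0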
def cos_similarity(embeddings: DataFrame, first_word: str, second_word: str) -> float:
    """
    Computes the cosine similarity between the embeddings of two words.

    :param embeddings: A dataframe containing the word embeddings.
    :type embeddings: DataFrame
    :param first_word: The first word in the pair.
    :type first_word: str
    :param second_word: The second word in the pair.
    :type second_word: str
    :returns: The cosine similarity between the first and second word.
    :rtype: float
    """
    first_word_vector: ndarray = np.array(embeddings.loc[first_word]).reshape(1, -1)
    second_word_vector: ndarray = np.array(embeddings.loc[second_word]).reshape(1, -1)

    cos_sim: ndarray = cosine_similarity(first_word_vector, second_word_vector)

    # Cast the numpy scalar to a plain float to match the annotated return type.
    return float(cos_sim[0][0])
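
A sketch with made-up vectors: parallel vectors score close to 1, orthogonal vectors score 0. Here the angle between the vectors is 45 degrees, so the similarity is 1/sqrt(2) ≈ 0.7071::

    import pandas as pd
    from dstk.modules.geometric_distance import cos_similarity

    toy = pd.DataFrame([[1.0, 1.0], [1.0, 0.0]], index=["cat", "tiger"])
    print(cos_similarity(toy, "cat", "tiger"))  # ~0.7071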
def nearest_neighbors(embeddings: DataFrame, word: str, metric: str = "cosine", n_words: int = 5, **kwargs) -> Neighbors:
    """
    Returns the top N most semantically similar words to a given target word,
    based on the specified distance or similarity metric.

    :param embeddings: A dataframe containing the word embeddings.
    :type embeddings: DataFrame
    :param word: The target word to find neighbors for.
    :type word: str
    :param metric: The distance or similarity metric to use (e.g., 'cosine', 'euclidean'). Defaults to 'cosine'.
    :type metric: str
    :param n_words: Number of nearest neighbors to return. Defaults to 5.
    :type n_words: int
    :param kwargs: Additional keyword arguments passed to sklearn's NearestNeighbors. For more information see:
        https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.NearestNeighbors.html
    :returns: A list of `Neighbor` namedtuples, one for each word close to the target word.
    :rtype: Neighbors
    """
    embeddings_array: ndarray = embeddings.to_numpy()
    word_series: Series = embeddings.loc[word]
    word_vector: ndarray = np.array(word_series).reshape(1, -1)

    distances: ndarray
    indices: ndarray

    neighbors: NearestNeighbors = NearestNeighbors(n_neighbors=n_words, algorithm="auto", metric=metric, **kwargs)
    neighbors.fit(embeddings_array)

    # Ask for one extra neighbor because the query word itself is usually
    # returned with distance 0; it is filtered out below.
    distances, indices = neighbors.kneighbors(word_vector, n_neighbors=n_words + 1)
    neighbor_tuples = zip(indices[0], distances[0])

    # sklearn reports cosine *distance*; convert it back to similarity.
    # Trim to n_words in case the query word was not among the results.
    results: Neighbors = [
        Neighbor(embeddings.index[index], 1 - distance if metric == "cosine" else distance)
        for index, distance in neighbor_tuples
        if embeddings.index[index] != word
    ][:n_words]

    return results
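
A usage sketch on a toy vocabulary (made-up vectors). With the default metric='cosine' the second element of each `Neighbor` is a similarity (1 - cosine distance), so higher means closer::

    import pandas as pd
    from dstk.modules.geometric_distance import nearest_neighbors

    toy = pd.DataFrame(
        [[1.0, 0.0], [0.9, 0.1], [0.0, 1.0], [0.1, 0.9]],
        index=["cat", "tiger", "bolt", "screw"],
    )
    for neighbor in nearest_neighbors(toy, "cat", n_words=2):
        print(neighbor)  # 'tiger' first (~0.994), then 'screw' (~0.110)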
def approximate_nearest_neighbors(embeddings: DataFrame, word: str, metric: str = "ivf", n_words: int = 5, n_centroids: int = 100, clusters_to_search: int = 10, n_connections: int = 16, search_depth: int = 8, construction_depth: int = 64) -> Neighbors:
    """
    Finds words with similar embeddings using a fast, memory-efficient approximate search.

    This function returns the closest words to a target word without comparing it against
    every vector directly. Instead, it uses index structures that give very close results
    much faster than an exact search, especially on large embedding sets.

    :param embeddings: A dataframe containing the word embeddings.
    :type embeddings: DataFrame
    :param word: The target word to find neighbors for.
    :type word: str
    :param metric: The search method to use ('ivf' or 'hnsw'). IVF uses less memory; HNSW is more accurate but heavier. Defaults to 'ivf'.
    :type metric: str
    :param n_words: Number of nearest neighbors to return. Defaults to 5.
    :type n_words: int
    :param n_centroids: The number of centroids IVF uses to group the embeddings. Equivalent to the number of clusters. Defaults to 100.
    :type n_centroids: int
    :param clusters_to_search: How many clusters (centroids) IVF visits during the search. Higher is slower but more accurate. Defaults to 10.
    :type clusters_to_search: int
    :param n_connections: Number of connections of each node in HNSW. Defaults to 16.
    :type n_connections: int
    :param search_depth: How much of the network HNSW explores during a search. Defaults to 8.
    :type search_depth: int
    :param construction_depth: How much of the network is explored while building it, i.e., how accurately the network is built. It does not affect search time, so higher values are usually better, but it does increase construction time. Defaults to 64.
    :type construction_depth: int
    :returns: A list of `Neighbor` namedtuples, one for each word close to the target word.
    :rtype: Neighbors
    """
    # faiss expects float32 arrays.
    embeddings_array: ndarray = embeddings.to_numpy().astype(np.float32)
    n_dimensions: int = embeddings_array.shape[1]

    word_series: Series = embeddings.loc[word]
    word_vector: ndarray = np.array(word_series, dtype=np.float32).reshape(1, -1)

    distances: ndarray
    indices: ndarray

    if metric == "hnsw":
        index = IndexHNSWFlat(n_dimensions, n_connections)
        index.hnsw.efSearch = search_depth
        index.hnsw.efConstruction = construction_depth
    elif metric == "ivf":
        quantizer: IndexFlatL2 = IndexFlatL2(n_dimensions)  # Clustering method.
        index = IndexIVFFlat(quantizer, n_dimensions, n_centroids)
        index.train(embeddings_array)  # Trains the index so it can create the clusters.
        index.nprobe = clusters_to_search
    else:
        raise RuntimeError(f"Metric '{metric}' is not implemented in approximate_nearest_neighbors")

    index.add(embeddings_array)

    # Ask for one extra neighbor because the query word itself is usually
    # returned with distance 0; it is filtered out below.
    distances, indices = index.search(word_vector, n_words + 1)
    neighbor_tuples = zip(indices[0], distances[0])

    results: Neighbors = [
        Neighbor(embeddings.index[index], distance)
        for index, distance in neighbor_tuples
        if embeddings.index[index] != word
    ][:n_words]

    return results
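
A sketch of the approximate variant on the same toy data. HNSW needs no training step; with metric='ivf' the default n_centroids=100 would exceed this tiny vocabulary, so keep n_centroids at or below the number of words. Note that faiss reports squared L2 distances here, so lower means closer::

    import pandas as pd
    from dstk.modules.geometric_distance import approximate_nearest_neighbors

    toy = pd.DataFrame(
        [[1.0, 0.0], [0.9, 0.1], [0.0, 1.0], [0.1, 0.9]],
        index=["cat", "tiger", "bolt", "screw"],
    )
    for neighbor in approximate_nearest_neighbors(toy, "cat", metric="hnsw", n_words=2):
        print(neighbor)  # 'tiger' first (~0.02), then 'screw' (~1.62)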