# Source code for dstk.parameters.vector_similarity.geometric_measures.dissimilarity

"""
This module provides geometric measures to calculate the dissimilarity between word embeddings.
By representing words as vectors in a high-dimensional space, these methods measure the
spatial "distance" between them; a smaller distance indicates higher semantic similarity,
while a larger distance indicates greater dissimilarity.

Core functionalities include:

* Calculating Euclidean distance (L2 norm) to determine the straight-line distance
  between two word vectors.
* Calculating Manhattan distance (L1 norm) to measure the distance between two
  word vectors along axes at right angles.
* Providing foundational geometric metrics used to evaluate distributional similarity
  and identify nearest neighbors for given lexemes.

The module is intended to provide researchers with standard spatial metrics to quantify
the relationships between words in a vector space.
"""

import numpy as np
from ....lib_types import Series, DataFrame



# [docs]
def euclidean_distance(
    embeddings: DataFrame, first_word: str, second_word: str
) -> float:
    """
    Return the Euclidean (L2) distance between the embeddings of two words.

    Both vectors are looked up by label with ``embeddings.loc``; the result is
    the L2 norm of their element-wise difference, converted to a built-in
    ``float``.

    :param embeddings: A dataframe containing the word embeddings, indexed by word.
    :type embeddings: DataFrame
    :param first_word: The first word in the pair.
    :type first_word: str
    :param second_word: The second word in the pair.
    :type second_word: str

    :returns: The Euclidean distance between the first and second word.
    :rtype: float
    :raises KeyError: If either word is not a label in the index of ``embeddings``.
    """

    # Element-wise difference of the two word vectors.
    difference: Series = embeddings.loc[first_word] - embeddings.loc[second_word]

    # With default arguments, np.linalg.norm computes the L2 norm of a 1-D input.
    distance = np.linalg.norm(difference)

    return float(distance)




# [docs]
def manhattan_distance(
    embeddings: DataFrame, first_word: str, second_word: str
) -> float:
    """
    Computes the Manhattan (L1) distance between the embeddings of two words.

    Both vectors are looked up by label with ``embeddings.loc``; the result is
    the sum of the absolute element-wise differences, converted to a built-in
    ``float``.

    :param embeddings: A dataframe containing the word embeddings, indexed by word.
    :type embeddings: DataFrame
    :param first_word: The first word in the pair.
    :type first_word: str
    :param second_word: The second word in the pair.
    :type second_word: str

    :returns: The Manhattan distance between the first and second word.
    :rtype: float
    :raises KeyError: If either word is not a label in the index of ``embeddings``.
    """

    first_word_vector: Series = embeddings.loc[first_word]
    second_word_vector: Series = embeddings.loc[second_word]

    absolute_differences = np.abs(first_word_vector - second_word_vector)

    # np.sum yields a NumPy scalar (e.g. np.float32 for float32 embeddings,
    # np.int64 for integer ones). Convert it so the result honours the
    # annotated ``float`` return type, as euclidean_distance already does.
    return float(np.sum(absolute_differences))