# Source code for dstk.models.count.matrix.classical

"""
This module provides classic matrix-based distributional semantic models for generating word embeddings.
As part of the 'count' model category, these methods determine word meanings by analyzing the frequency
of co-occurrences within a corpus to build large matrices of word counts. 
Documents can be optionally normalized before training through lowercasing, 
lemmatization or stemming, part-of-speech filtering, stop-word removal, and punctuation removal. 
The trained models can then be used to compute semantic similarity, identify nearest neighbors, and
export learned embeddings for downstream analyses.

Core functionalities include:

* Implementing the Standard Model (Lenci & Sahlgren) using co-occurrence matrices and PPMI weighting.
* Implementing Latent Semantic Analysis (LSA) utilizing word-document matrices and TF-IDF weighting.
* Integrated preprocessing pipelines including case normalization, lemmatization/stemming,
  part-of-speech filtering, and stop-word removal.
* Dimensionality reduction using Singular Value Decomposition (SVD) to map words into a
  lower-dimensional semantic space.
* Calculating geometric similarity measures, such as cosine similarity and nearest neighbor identification.

The module is designed to provide foundational methods for extracting semantic relationships from text
based on traditional distributional semantics.
"""

from ...tools import ModelBuilder, DistanceMeasurements

from typing import overload, Literal
from ....lib_types import (
    ReturnParameterGenerator,
    ReturnAllGenerator,
    Document,
    DocumentIndex,
)


# Typing overload: when neither ``return_parameters`` nor ``return_all`` is
# requested (both left at their defaults), the call is typed as returning the
# ``DistanceMeasurements`` model itself.
@overload
def StandardModel(
    document: Document,
    *,
    lowercase: bool = True,
    base_form: Literal["lemma", "stem"] | None = "lemma",
    allowed_pos: set[str] | None = None,
    remove_stop_words: bool = True,
    language: str | None = None,
    custom_stop_words: list[str] | None = None,
    frequency_threshold: int = 50,
    window_size: int = 3,
    n_dimensions: int = 300,
    return_parameters: None = None,
    return_all: Literal[False] = False,
) -> DistanceMeasurements: ...


# Typing overload: when ``return_parameters`` is given as a list of workflow
# step names (required here, no default) and ``return_all`` stays False, the
# call is typed as returning a ``ReturnParameterGenerator``.
@overload
def StandardModel(
    document: Document,
    *,
    lowercase: bool = True,
    base_form: Literal["lemma", "stem"] | None = "lemma",
    allowed_pos: set[str] | None = None,
    remove_stop_words: bool = True,
    language: str | None = None,
    custom_stop_words: list[str] | None = None,
    frequency_threshold: int = 50,
    window_size: int = 3,
    n_dimensions: int = 300,
    return_parameters: list[str],
    return_all: Literal[False] = False,
) -> ReturnParameterGenerator: ...


# Typing overload: when ``return_all=True`` is passed explicitly (required
# here, no default) and ``return_parameters`` stays None, the call is typed as
# returning a ``ReturnAllGenerator``.
@overload
def StandardModel(
    document: Document,
    *,
    lowercase: bool = True,
    base_form: Literal["lemma", "stem"] | None = "lemma",
    allowed_pos: set[str] | None = None,
    remove_stop_words: bool = True,
    language: str | None = None,
    custom_stop_words: list[str] | None = None,
    frequency_threshold: int = 50,
    window_size: int = 3,
    n_dimensions: int = 300,
    return_parameters: None = None,
    return_all: Literal[True],
) -> ReturnAllGenerator: ...



# [docs] -- Sphinx viewcode anchor left over from the rendered listing; not module code.
def StandardModel(
    document: Document,
    *,
    lowercase: bool = True,
    base_form: Literal["lemma", "stem"] | None = "lemma",
    allowed_pos: set[str] | None = None,
    remove_stop_words: bool = True,
    language: str | None = None,
    custom_stop_words: list[str] | None = None,
    frequency_threshold: int = 50,
    window_size: int = 3,
    n_dimensions: int = 300,
    return_parameters: list[str] | None = None,
    return_all: bool = False,
) -> ReturnParameterGenerator | ReturnAllGenerator | DistanceMeasurements:
    """
    Build the standard count-based distributional model (Lenci & Sahlgren 97-99) for a single document.

    The workflow extracts words, applies the enabled lexical normalization
    steps, collects co-occurrences inside a context window, builds a
    word-by-word matrix weighted with positive PMI (PPMI), reduces it with
    SVD, and exposes cosine similarity and nearest-neighbor measures.

    :param document: Input document.
    :type document: Document
    :param lowercase: When true, a ``to_lower`` step is added to the lexical
        selection stage.
    :type lowercase: bool
    :param base_form: Base form passed to the ``to_base_form`` step
        (``"lemma"`` or ``"stem"``). When ``None``, the step is skipped.
    :type base_form: Literal["lemma", "stem"] | None
    :param allowed_pos: Part-of-speech tags passed to the ``filter_by_pos``
        step. When ``None`` or empty, the step is skipped.
    :type allowed_pos: set[str] | None
    :param remove_stop_words: When true, a ``remove_stop_words`` step is added,
        configured with ``language`` and ``custom_stop_words``.
    :type remove_stop_words: bool
    :param language: Language forwarded to the ``remove_stop_words`` step.
    :type language: str | None
    :param custom_stop_words: Stop-word list forwarded to the
        ``remove_stop_words`` step.
    :type custom_stop_words: list[str] | None
    :param frequency_threshold: Minimum frequency required for a word to
        be included in the vocabulary.
    :type frequency_threshold: int
    :param window_size: Size of the context window.
    :type window_size: int
    :param n_dimensions: Number of dimensions in the reduced semantic space.
    :type n_dimensions: int
    :param return_parameters: Names of workflow steps to return instead of
        the final model output.


        Available values:

        * ``context.selection.unit``
        * ``context.selection.lexical``
        * ``context.extraction.linguistic.word.window``
        * ``co_matrix.creation.linguistic.word.window``
        * ``co_matrix.weighting.associative_measures``
        * ``dimensionality_reduction``
        * ``vector_similarity.geometric_measures.similarity``


    :type return_parameters: list[str] | None
    :param return_all: Return all workflow steps and their outputs.
    :type return_all: bool

    :return: A semantic similarity model, a generator of selected workflow
        results, or a generator of all workflow results.
    :rtype: DistanceMeasurements | ReturnParameterGenerator | ReturnAllGenerator
    """

    # Lexical selection: each optional normalization step is appended only
    # when enabled, in a fixed order; the frequency filter always runs last.
    lexical_steps: list[dict] = []
    if lowercase:
        lexical_steps.append({"to_lower": {}})
    if base_form:
        lexical_steps.append({"to_base_form": {"base_form": base_form}})
    if allowed_pos:
        lexical_steps.append({"filter_by_pos": {"allowed_pos": allowed_pos}})
    if remove_stop_words:
        lexical_steps.append(
            {
                "remove_stop_words": {
                    "language": language,
                    "custom_stop_words": custom_stop_words,
                }
            }
        )
    lexical_steps.append({"filter_by_frequency": {"threshold": frequency_threshold}})

    # Stage names double as the values accepted by ``return_parameters``.
    builder: ModelBuilder[DistanceMeasurements] = ModelBuilder(
        name="StandardModel",
        workflow={
            "context.selection.unit": [{"get_words": {}}],
            "context.selection.lexical": lexical_steps,
            "context.extraction.linguistic.word.window": [
                {"extract_ngrams": {"window_size": window_size}},
            ],
            "co_matrix.creation.linguistic.word.window": [
                {"create_word_by_word_matrix": {}},
            ],
            "co_matrix.weighting.associative_measures": [
                {"pmi": {"positive": True}},
            ],
            "dimensionality_reduction": [
                {"svd": {"n_dimensions": n_dimensions}},
            ],
            "vector_similarity.geometric_measures.similarity": [
                {"cos_similarity": {}},
                {"nearest_neighbors": {}},
                {"approximate_nearest_neighbors": {}},
            ],
        },
        wrapper=True,
    )

    return builder(
        input_data=document,
        return_parameters=return_parameters,
        return_all=return_all,
    )



# Typing overload: when neither ``return_parameters`` nor ``return_all`` is
# requested, the call is typed as returning the ``DistanceMeasurements`` model.
# ``return_parameters`` needs its ``= None`` default here: without it a plain
# ``LatentSemanticAnalysis(index)`` call matches none of the overloads, since
# each of them would then require a keyword the caller did not pass.
@overload
def LatentSemanticAnalysis(
    document_index: DocumentIndex,
    *,
    lowercase: bool = True,
    base_form: Literal["lemma", "stem"] | None = "lemma",
    allowed_pos: set[str] | None = None,
    remove_stop_words: bool = True,
    language: str | None = None,
    custom_stop_words: list[str] | None = None,
    frequency_threshold: int = 50,
    n_dimensions: int = 300,
    return_parameters: None = None,
    return_all: Literal[False] = False,
) -> DistanceMeasurements: ...


# Typing overload: when ``return_parameters`` is given as a list of workflow
# step names (required here, no default) and ``return_all`` stays False, the
# call is typed as returning a ``ReturnParameterGenerator``.
@overload
def LatentSemanticAnalysis(
    document_index: DocumentIndex,
    *,
    lowercase: bool = True,
    base_form: Literal["lemma", "stem"] | None = "lemma",
    allowed_pos: set[str] | None = None,
    remove_stop_words: bool = True,
    language: str | None = None,
    custom_stop_words: list[str] | None = None,
    frequency_threshold: int = 50,
    n_dimensions: int = 300,
    return_parameters: list[str],
    return_all: Literal[False] = False,
) -> ReturnParameterGenerator: ...


# Typing overload: when ``return_all=True`` is passed explicitly (required
# here, no default) and ``return_parameters`` stays None, the call is typed as
# returning a ``ReturnAllGenerator``.
@overload
def LatentSemanticAnalysis(
    document_index: DocumentIndex,
    *,
    lowercase: bool = True,
    base_form: Literal["lemma", "stem"] | None = "lemma",
    allowed_pos: set[str] | None = None,
    remove_stop_words: bool = True,
    language: str | None = None,
    custom_stop_words: list[str] | None = None,
    frequency_threshold: int = 50,
    n_dimensions: int = 300,
    return_parameters: None = None,
    return_all: Literal[True],
) -> ReturnAllGenerator: ...



# [docs] -- Sphinx viewcode anchor left over from the rendered listing; not module code.
def LatentSemanticAnalysis(
    document_index: DocumentIndex,
    *,
    lowercase: bool = True,
    base_form: Literal["lemma", "stem"] | None = "lemma",
    allowed_pos: set[str] | None = None,
    remove_stop_words: bool = True,
    language: str | None = None,
    custom_stop_words: list[str] | None = None,
    frequency_threshold: int = 50,
    n_dimensions: int = 300,
    return_parameters: list[str] | None = None,
    return_all: bool = False,
) -> ReturnParameterGenerator | ReturnAllGenerator | DistanceMeasurements:
    """
    Generate word embeddings using Latent Semantic Analysis (LSA) as defined by (Lenci & Sahlgren 100-103).

    The model builds a word-document matrix, applies TF-IDF weighting,
    reduces dimensionality with SVD, and provides cosine-based similarity
    measures.

    :param document_index: Mapping of document names to documents. Must
        contain at least one entry.
    :type document_index: DocumentIndex
    :param lowercase: When true, a ``to_lower`` step is added to the lexical
        selection stage.
    :type lowercase: bool
    :param base_form: Base form passed to the ``to_base_form`` step
        (``"lemma"`` or ``"stem"``). When ``None``, the step is skipped.
    :type base_form: Literal["lemma", "stem"] | None
    :param allowed_pos: Part-of-speech tags passed to the ``filter_by_pos``
        step. When ``None`` or empty, the step is skipped.
    :type allowed_pos: set[str] | None
    :param remove_stop_words: When true, a ``remove_stop_words`` step is added,
        configured with ``language`` and ``custom_stop_words``.
    :type remove_stop_words: bool
    :param language: Language forwarded to the ``remove_stop_words`` step.
    :type language: str | None
    :param custom_stop_words: Stop-word list forwarded to the
        ``remove_stop_words`` step.
    :type custom_stop_words: list[str] | None
    :param frequency_threshold: Minimum frequency required for a word to
        be included in the vocabulary.
    :type frequency_threshold: int
    :param n_dimensions: Number of dimensions in the reduced semantic space.
    :type n_dimensions: int
    :param return_parameters: Names of workflow steps to return instead of
        the final model output.


        Available values:

        * ``context.selection.unit``
        * ``context.selection.lexical``
        * ``co_matrix.creation.document``
        * ``co_matrix.weighting.relevance_measures``
        * ``dimensionality_reduction``
        * ``vector_similarity.geometric_measures.similarity``


    :type return_parameters: list[str] | None
    :param return_all: Return all workflow steps and their outputs.
    :type return_all: bool

    :raises ValueError: If ``document_index`` is empty.

    :return: A semantic similarity model, a generator of selected workflow
        results, or a generator of all workflow results.
    :rtype: DistanceMeasurements | ReturnParameterGenerator | ReturnAllGenerator
    """

    # ``zip(*items)`` on an empty mapping yields nothing, so the unpacking
    # below would fail with an opaque "not enough values to unpack" error.
    # Fail early with the same exception type and a message naming the cause.
    if not document_index:
        raise ValueError("document_index must contain at least one document")

    # Split the mapping into parallel tuples: names label the matrix columns,
    # documents are the workflow input.
    document_names, documents = zip(*document_index.items())

    Model: ModelBuilder[DistanceMeasurements] = ModelBuilder(
        name="LatentSemanticAnalysis",
        workflow={
            "context.selection.unit": [
                {"get_words": {}},
            ],
            "context.selection.lexical": [
                # Optional normalization steps are included only when enabled.
                *([{"to_lower": {}}] if lowercase else []),  # type: ignore
                *([{"to_base_form": {"base_form": base_form}}] if base_form else []),  # type: ignore
                *([{"filter_by_pos": {"allowed_pos": allowed_pos}}] if allowed_pos else []),  # type: ignore
                *([{"remove_stop_words": {"language": language, "custom_stop_words": custom_stop_words}}] if remove_stop_words else []),  # type: ignore
                {"filter_by_frequency": {"threshold": frequency_threshold}},
            ],
            "co_matrix.creation.document": [
                {"create_word_by_document_matrix": {"document_names": document_names}}
            ],
            "co_matrix.weighting.relevance_measures": [{"tf_idf": {}}],
            "dimensionality_reduction": [{"svd": {"n_dimensions": n_dimensions}}],
            "vector_similarity.geometric_measures.similarity": [
                {"cos_similarity": {}},
                {"nearest_neighbors": {}},
                {"approximate_nearest_neighbors": {}},
            ],
        },
        wrapper=True,
    )

    return Model(
        input_data=documents, return_parameters=return_parameters, return_all=return_all
    )