# Source code for dstk.models.predict.word2vec

"""
This module provides predictive distributional semantic models for learning
word embeddings from linguistically preprocessed corpora.

It implements neural network-based distributional semantic models that learn
dense vector representations of words by predicting lexical contexts rather
than counting co-occurrences. Documents can be optionally normalized before
training through lowercasing, lemmatization or stemming, part-of-speech
filtering, and stop-word removal; punctuation is always removed. The trained
models can then be used to compute semantic similarity, identify nearest
neighbors, and export learned embeddings for downstream analyses.

Core functionalities include:
* Training Skip-Gram with Negative Sampling (SGNS) word embedding models
* Training FastText word embedding models with subword information
* Applying optional linguistic preprocessing before model training
* Converting trained embedding models into tabular representations
* Computing cosine similarity between word vectors
* Retrieving exact and approximate nearest semantic neighbors
* Returning intermediate workflow outputs for inspection or reuse

The module is intended to provide predictive (neural network-based)
distributional semantic models for lexical semantic analysis in digital
humanities and corpus linguistics workflows.
"""

import atexit
import os
from tempfile import NamedTemporaryFile
from typing import Literal, overload

import fasttext
from gensim.models import Word2Vec

from ..tools import ModelBuilder, DistanceMeasurements
from ...hooks.data_conversion import ModelToDataframe, SequencesToString
from ...hooks.tools import Hook
from ...lib_types import ReturnAllGenerator, ReturnParameterGenerator, Document


# Overload 1: neither ``return_parameters`` nor ``return_all`` is requested.
@overload
def SGNS(
    document: Document,
    *,
    lowercase: bool = True,
    base_form: Literal["lemma", "stem"] | None = "lemma",
    allowed_pos: set[str] | None = None,
    remove_stop_words: bool = True,
    language: str | None = None,
    custom_stop_words: list[str] | None = None,
    window_size: int = 5,
    n_dimensions: int = 300,
    n_negative_samples: int = 5,
    word_probability_distribution: float = 0.75,
    subsampling: float = 1e-5,
    frequency_threshold: int = 5,
    return_parameters: None = None,
    return_all: Literal[False] = False,
    **kwargs: object,
) -> DistanceMeasurements: ...


# Overload 2: a list of workflow result names is requested.
@overload
def SGNS(
    document: Document,
    *,
    lowercase: bool = True,
    base_form: Literal["lemma", "stem"] | None = "lemma",
    allowed_pos: set[str] | None = None,
    remove_stop_words: bool = True,
    language: str | None = None,
    custom_stop_words: list[str] | None = None,
    window_size: int = 5,
    n_dimensions: int = 300,
    n_negative_samples: int = 5,
    word_probability_distribution: float = 0.75,
    subsampling: float = 1e-5,
    frequency_threshold: int = 5,
    return_parameters: list[str],
    return_all: Literal[False] = False,
    **kwargs: object,
) -> ReturnParameterGenerator: ...


# Overload 3: every workflow result is requested.
@overload
def SGNS(
    document: Document,
    *,
    lowercase: bool = True,
    base_form: Literal["lemma", "stem"] | None = "lemma",
    allowed_pos: set[str] | None = None,
    remove_stop_words: bool = True,
    language: str | None = None,
    custom_stop_words: list[str] | None = None,
    window_size: int = 5,
    n_dimensions: int = 300,
    n_negative_samples: int = 5,
    word_probability_distribution: float = 0.75,
    subsampling: float = 1e-5,
    frequency_threshold: int = 5,
    return_parameters: None = None,
    return_all: Literal[True],
    **kwargs: object,
) -> ReturnAllGenerator: ...


def SGNS(
    document: Document,
    *,
    lowercase: bool = True,
    base_form: Literal["lemma", "stem"] | None = "lemma",
    allowed_pos: set[str] | None = None,
    remove_stop_words: bool = True,
    language: str | None = None,
    custom_stop_words: list[str] | None = None,
    window_size: int = 5,
    n_dimensions: int = 300,
    n_negative_samples: int = 5,
    word_probability_distribution: float = 0.75,
    subsampling: float = 1e-5,
    frequency_threshold: int = 5,
    return_parameters: list[str] | None = None,
    return_all: bool = False,
    **kwargs: object,
) -> ReturnParameterGenerator | ReturnAllGenerator | DistanceMeasurements:
    """
    Generates word embeddings with Skip-Gram with Negative Sampling (SGNS) as defined by (Lenci & Sahlgren 162-163).

    The document is optionally normalized (lowercasing, lemmatization/stemming,
    POS filtering, stop-word removal) before training a ``Word2Vec`` model;
    punctuation is always removed. The resulting embeddings can be explored
    through cosine similarity and nearest-neighbor methods.

    :param document: Input document.
    :type document: Document
    :param lowercase: Convert words to lowercase before training.
    :type lowercase: bool
    :param base_form: Use lemmas or stems instead of surface forms. ``None``
        skips this step.
    :type base_form: Literal["lemma", "stem"] | None
    :param allowed_pos: Keep only words with these POS tags. ``None`` or an
        empty set skips the filter.
    :type allowed_pos: set[str] | None
    :param remove_stop_words: Remove stop words before training.
    :type remove_stop_words: bool
    :param language: Language forwarded to the stop-word removal step. Only
        used when ``remove_stop_words`` is enabled.
    :type language: str | None
    :param custom_stop_words: Stop-word list forwarded to the stop-word removal
        step. Only used when ``remove_stop_words`` is enabled.
    :type custom_stop_words: list[str] | None
    :param window_size: Context window size (``Word2Vec`` ``window``).
    :type window_size: int
    :param n_dimensions: Embedding dimensionality (``Word2Vec`` ``vector_size``).
    :type n_dimensions: int
    :param n_negative_samples: Number of negative samples (``Word2Vec``
        ``negative``).
    :type n_negative_samples: int
    :param word_probability_distribution: Exponent shaping the negative
        sampling distribution (``Word2Vec`` ``ns_exponent``).
    :type word_probability_distribution: float
    :param subsampling: Subsampling threshold (``Word2Vec`` ``sample``).
    :type subsampling: float
    :param frequency_threshold: Minimum word frequency (``Word2Vec``
        ``min_count``).
    :type frequency_threshold: int
    :param return_parameters: Return only the specified workflow results.


        Available values:

        * ``context.selection.unit``
        * ``context.selection.lexical``
        * ``sentences_to_string``
        * ``trained_model``
        * ``embeddings_dataframe``
        * ``vector_similarity.geometric_measures.similarity``


    :type return_parameters: list[str] | None
    :param return_all: Return all workflow results.
    :type return_all: bool
    :param kwargs: Additional arguments passed to ``gensim.models.Word2Vec``.
        They are merged last, so they take precedence over the values built
        from the named parameters above.

    For more information check: https://radimrehurek.com/gensim/models/word2vec.html

    :return: Semantic distance measurements or workflow results.
    :rtype: DistanceMeasurements | ReturnParameterGenerator | ReturnAllGenerator
    """

    def sentences_to_string(sentences) -> list[list[str]]:
        """Replace every word object of every sentence by its ``text`` attribute."""
        return [[word.text for word in sentence] for sentence in sentences]

    Word2VecHook: Hook = Hook(method=Word2Vec)
    Word2VecHook.set_default_args(
        {
            "sg": 1,  # 1 selects skip-gram in gensim's Word2Vec
            "window": window_size,
            "vector_size": n_dimensions,
            "negative": n_negative_samples,
            "ns_exponent": word_probability_distribution,
            "sample": subsampling,
            "min_count": frequency_threshold,
            # Unpacked last so caller-supplied options win over the defaults.
            **kwargs,
        }
    )
    SentencesToString: Hook = Hook(method=sentences_to_string)

    Model: ModelBuilder = ModelBuilder(
        name="SGNS",
        workflow={
            "context.selection.unit": [
                {"get_sentences": {}},
            ],
            # Optional normalization steps are spliced in only when enabled;
            # punctuation removal is unconditional.
            "context.selection.lexical": [
                *([{"to_lower": {}}] if lowercase else []),  # type: ignore
                *([{"to_base_form": {"base_form": base_form}}] if base_form else []),  # type: ignore
                *([{"filter_by_pos": {"allowed_pos": allowed_pos}}] if allowed_pos else []),  # type: ignore
                *([{"remove_stop_words": {"language": language, "custom_stop_words": custom_stop_words}}] if remove_stop_words else []),  # type: ignore
                {"remove_punctuation": {}},
            ],
            "sentences_to_string": SentencesToString,
            "trained_model": Word2VecHook,
            "embeddings_dataframe": ModelToDataframe,
            "vector_similarity.geometric_measures.similarity": [
                {"cos_similarity": {}},
                {"nearest_neighbors": {}},
                {"approximate_nearest_neighbors": {}},
            ],
        },
        wrapper=True,
    )

    return Model(
        input_data=document, return_parameters=return_parameters, return_all=return_all
    )



# Overload 1: neither ``return_parameters`` nor ``return_all`` is requested.
@overload
def Fasttext(
    document: Document,
    *,
    lowercase: bool = True,
    base_form: Literal["lemma", "stem"] | None = "lemma",
    allowed_pos: set[str] | None = None,
    remove_stop_words: bool = True,
    language: str | None = None,
    custom_stop_words: list[str] | None = None,
    window_size: int = 5,
    n_dimensions: int = 100,
    n_negative_samples: int = 5,
    subsampling: float = 0.0001,
    frequency_threshold: int = 5,
    min_ngram_size: int = 3,
    max_ngram_size: int = 6,
    model: str = "skipgram",
    return_parameters: None = None,
    return_all: Literal[False] = False,
    **kwargs: object,
) -> DistanceMeasurements: ...


# Overload 2: a list of workflow result names is requested.
@overload
def Fasttext(
    document: Document,
    *,
    lowercase: bool = True,
    base_form: Literal["lemma", "stem"] | None = "lemma",
    allowed_pos: set[str] | None = None,
    remove_stop_words: bool = True,
    language: str | None = None,
    custom_stop_words: list[str] | None = None,
    window_size: int = 5,
    n_dimensions: int = 100,
    n_negative_samples: int = 5,
    subsampling: float = 0.0001,
    frequency_threshold: int = 5,
    min_ngram_size: int = 3,
    max_ngram_size: int = 6,
    model: str = "skipgram",
    return_parameters: list[str],
    return_all: Literal[False] = False,
    **kwargs: object,
) -> ReturnParameterGenerator: ...


# Overload 3: every workflow result is requested.
@overload
def Fasttext(
    document: Document,
    *,
    lowercase: bool = True,
    base_form: Literal["lemma", "stem"] | None = "lemma",
    allowed_pos: set[str] | None = None,
    remove_stop_words: bool = True,
    language: str | None = None,
    custom_stop_words: list[str] | None = None,
    window_size: int = 5,
    n_dimensions: int = 100,
    n_negative_samples: int = 5,
    subsampling: float = 0.0001,
    frequency_threshold: int = 5,
    min_ngram_size: int = 3,
    max_ngram_size: int = 6,
    model: str = "skipgram",
    return_parameters: None = None,
    return_all: Literal[True],
    **kwargs: object,
) -> ReturnAllGenerator: ...


def Fasttext(
    document: Document,
    *,
    lowercase: bool = True,
    base_form: Literal["lemma", "stem"] | None = "lemma",
    allowed_pos: set[str] | None = None,
    remove_stop_words: bool = True,
    language: str | None = None,
    custom_stop_words: list[str] | None = None,
    window_size: int = 5,
    n_dimensions: int = 100,
    n_negative_samples: int = 5,
    subsampling: float = 0.0001,
    frequency_threshold: int = 5,
    min_ngram_size: int = 3,
    max_ngram_size: int = 6,
    model: str = "skipgram",
    return_parameters: list[str] | None = None,
    return_all: bool = False,
    **kwargs: object,
) -> ReturnParameterGenerator | ReturnAllGenerator | DistanceMeasurements:
    """
    Generates word embeddings using FastText as defined by (Lenci & Sahlgren 164-165).

    The document is optionally normalized (lowercasing, lemmatization/stemming,
    POS filtering, stop-word removal) before training a FastText model;
    punctuation is always removed. Subword information is used to improve
    representations of rare and out-of-vocabulary words.

    FastText trains from a file, so the preprocessed sentences are written to
    a UTF-8 encoded temporary ``.txt`` file when the ``save_sentences`` step
    runs. Removal of that file is attempted when the interpreter exits.

    :param document: Input document.
    :type document: Document
    :param lowercase: Convert words to lowercase before training.
    :type lowercase: bool
    :param base_form: Use lemmas or stems instead of surface forms. ``None``
        skips this step.
    :type base_form: Literal["lemma", "stem"] | None
    :param allowed_pos: Keep only words with these POS tags. ``None`` or an
        empty set skips the filter.
    :type allowed_pos: set[str] | None
    :param remove_stop_words: Remove stop words before training.
    :type remove_stop_words: bool
    :param language: Language forwarded to the stop-word removal step. Only
        used when ``remove_stop_words`` is enabled.
    :type language: str | None
    :param custom_stop_words: Stop-word list forwarded to the stop-word removal
        step. Only used when ``remove_stop_words`` is enabled.
    :type custom_stop_words: list[str] | None
    :param window_size: Context window size (FastText ``ws``).
    :type window_size: int
    :param n_dimensions: Embedding dimensionality (FastText ``dim``).
    :type n_dimensions: int
    :param n_negative_samples: Number of negative samples (FastText ``neg``).
    :type n_negative_samples: int
    :param subsampling: Subsampling threshold (FastText ``t``).
    :type subsampling: float
    :param frequency_threshold: Minimum word frequency (FastText ``minCount``).
    :type frequency_threshold: int
    :param min_ngram_size: Minimum character n-gram length.
    :type min_ngram_size: int
    :param max_ngram_size: Maximum character n-gram length.
    :type max_ngram_size: int
    :param model: Training algorithm (``"skipgram"`` or ``"cbow"``).
    :type model: str
    :param return_parameters: Return only the specified workflow results.


        Available values:

        * ``context.selection.unit``
        * ``context.selection.lexical``
        * ``sentences_to_string``
        * ``save_sentences``
        * ``trained_model``
        * ``embeddings_dataframe``
        * ``vector_similarity.geometric_measures.similarity``


    :type return_parameters: list[str] | None
    :param return_all: Return all workflow results.
    :type return_all: bool
    :param kwargs: Additional arguments passed to FastText. They are merged
        last, so they take precedence over the values built from the named
        parameters above.

    For more information check: https://fasttext.cc/docs/en/python-module.html

    :return: Semantic distance measurements or workflow results.
    :rtype: DistanceMeasurements | ReturnParameterGenerator | ReturnAllGenerator
    """

    def _remove_quietly(path: str) -> None:
        """Delete ``path``, ignoring failures such as an already-removed file."""
        try:
            os.remove(path)
        except OSError:
            # Deliberate best-effort cleanup: nothing useful can be done at
            # interpreter exit if the file is gone or cannot be removed.
            pass

    def save_sentences(sentences: list[str]) -> str:
        """Write the sentences, newline-separated, to a temp file and return its path."""
        # The file is created here, when the workflow step actually runs, so it
        # is guaranteed to exist while FastText trains. ``delete=False`` keeps
        # it on disk after the handle is closed; UTF-8 is the encoding FastText
        # expects regardless of the platform default.
        with NamedTemporaryFile(
            mode="w", suffix=".txt", encoding="utf-8", delete=False
        ) as corpus_file:
            corpus_file.write("\n".join(sentences))

        # The path is exposed as a workflow result, so it is kept for the
        # lifetime of the process and only cleaned up at exit.
        atexit.register(_remove_quietly, corpus_file.name)

        return corpus_file.name

    FastTextHook: Hook = Hook(method=fasttext.train_unsupervised)
    FastTextHook.set_default_args(
        {
            "ws": window_size,
            "dim": n_dimensions,
            "neg": n_negative_samples,
            "t": subsampling,
            "minCount": frequency_threshold,
            "minn": min_ngram_size,
            "maxn": max_ngram_size,
            "model": model,
            "verbose": 0,
            # Unpacked last so caller-supplied options win over the defaults.
            **kwargs,
        }
    )
    SaveSentencesHook: Hook = Hook(method=save_sentences)

    Model: ModelBuilder = ModelBuilder(
        name="Fasttext",
        workflow={
            "context.selection.unit": [
                {"get_sentences": {}},
            ],
            # Optional normalization steps are spliced in only when enabled;
            # punctuation removal is unconditional.
            "context.selection.lexical": [
                *([{"to_lower": {}}] if lowercase else []),  # type: ignore
                *([{"to_base_form": {"base_form": base_form}}] if base_form else []),  # type: ignore
                *([{"filter_by_pos": {"allowed_pos": allowed_pos}}] if allowed_pos else []),  # type: ignore
                *([{"remove_stop_words": {"language": language, "custom_stop_words": custom_stop_words}}] if remove_stop_words else []),  # type: ignore
                {"remove_punctuation": {}},
            ],
            "sentences_to_string": SequencesToString,
            "save_sentences": SaveSentencesHook,
            "trained_model": FastTextHook,
            "embeddings_dataframe": ModelToDataframe,
            "vector_similarity.geometric_measures.similarity": [
                {"cos_similarity": {}},
                {"nearest_neighbors": {}},
                {"approximate_nearest_neighbors": {}},
            ],
        },
        wrapper=True,
    )

    return Model(
        input_data=document, return_parameters=return_parameters, return_all=return_all
    )