Source code for dstk.parameters.co_matrix.creation.linguistic.word.window

"""
This module provides tools for constructing word-by-word co-occurrence matrices based on
"Lexemes" within the framework of distributional semantics. Specifically, it focuses
on "Window-based collocates," where the context of a word is determined by its
proximity to other words in a sequence.

The module facilitates the transition from raw linguistic sequences to structured
mathematical representations, allowing researchers to analyze how words appear together
within specific windows.

Core functionalities include:

* Generating co-occurrence matrices (word $\\times$ word) from lists of tokenized contexts.
* Leveraging standard vectorization tools to handle preprocessing such as stop_words and n-grams.
* Converting sparse mathematical matrices into labeled DataFrames for easier analysis by
  linguists and researchers.
* Providing a framework for calculating how words relate to one another based on
  spatial proximity within the text.

This module is intended for use when analyzing lexical relationships where the
physical distance between words (the "window") defines their relationship.
"""

from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from scipy.sparse import csr_array
from ......utilities.data_conversion import sequence_to_string

from typing import Sequence, cast
from ......lib_types import csr_array, DataFrame, Word



[docs]
def create_word_by_word_matrix(contexts: Sequence[Sequence[Word]], **kwargs) -> DataFrame:
    """
    Create a labeled feature-by-feature co-occurrence table from tokenized contexts.

    Each context is turned into a single string with ``sequence_to_string`` and the
    resulting strings are counted with scikit-learn's ``CountVectorizer``. The
    context-by-feature count matrix is then multiplied by its own transpose, so cell
    ``(i, j)`` adds up, across all contexts, the product of the counts of feature ``i``
    and feature ``j``. As a consequence the diagonal is not zeroed: it holds the sum of
    squared counts of each feature.

    :param contexts: Sequences of words or token objects, one sequence per context.
    :type contexts: Sequence[Sequence[Word]]
    :param kwargs: Keyword arguments forwarded unchanged to sklearn CountVectorizer
        (e.g. stop_words, ngram_range).

    For more information check: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

    :return: Square co-occurrence table backed by sparse columns, with the vectorizer's
        feature names as both row and column labels.
    :rtype: DataFrame
    """

    # CountVectorizer is fed one string per context.
    joined_contexts: list[str] = list(map(sequence_to_string, contexts))

    counter: CountVectorizer = CountVectorizer(**kwargs)
    context_term_counts: csr_array = cast(
        csr_array, counter.fit_transform(joined_contexts)
    )

    # The same feature names label both axes of the square result.
    feature_labels = counter.get_feature_names_out()

    # (features x contexts) @ (contexts x features) -> (features x features)
    cooccurrence: csr_array = context_term_counts.T @ context_term_counts

    return pd.DataFrame.sparse.from_spmatrix(
        cooccurrence, index=feature_labels, columns=feature_labels
    )