Source code for dstk.parameters.co_matrix.creation.document

"""
This module provides tools for generating a Word By Document Matrix within the
framework of distributional semantics. It focuses on "Documents" as a primary
category of context, where word occurrences are analyzed based on their presence
within larger structures such as articles, books, or specific records.

The module facilitates the transition from raw linguistic sequences to structured
numerical representations, allowing researchers to analyze how words appear across
different documents.

Core functionalities include:
* Converting collections of lexical item sequences (words/tokens) into a
  matrix format suitable for distributional analysis.
* Integrating with scikit-learn's CountVectorizer to handle n-grams and
  stop-word filtering during the matrix construction process.
* Generating a sparse DataFrame where rows represent unique terms and columns
  represent distinct documents (Word x Document matrix).
* Mapping internal data structures into standard pandas DataFrames for easier
  manipulation in downstream analysis.

This module serves as a foundational step in creating co-occurrence matrices
based on document-level context rather than purely local linguistic units.
"""

from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

from ....utilities.data_conversion import sequence_to_string
from ....utilities.typeguards import is_words, is_tokens

from typing import Sequence, cast
from ....lib_types import csr_array, DataFrame, Word



[docs]
def create_word_by_document_matrix(
    documents_words: Sequence[Sequence[Word]],
    document_names: list[str] | None = None,
    **kwargs,
) -> DataFrame:
    """
    Creates a Word By Document Matrix.

    Each inner sequence of ``documents_words`` is turned into a single string
    with ``sequence_to_string``; the strings are then counted with
    scikit-learn's CountVectorizer and the resulting document-term matrix is
    transposed, so rows are the vectorizer's features and columns are documents.

    :param documents_words: A sequence of word or token object sequences, one inner sequence per document.
    :type documents_words: Sequence[Sequence[Word]]
    :param document_names: Optional list of names for the columns. When omitted (or empty), columns are named ``document_0``, ``document_1``, and so on.
    :type document_names: list[str] | None
    :param kwargs: Additional arguments passed to scikit-learn's CountVectorizer (e.g., stop_words, ngram_range).

    For more information check: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

    :type kwargs: dict

    :return: Sparse co-occurrence matrix (word x documents).
    :rtype: DataFrame

    :raises ValueError: If ``documents_words`` itself passes the ``is_words`` or ``is_tokens`` type guard (presumably a single flat sequence instead of a collection of sequences), or if ``document_names`` is provided and its length differs from the number of documents.
    """

    if is_words(documents_words) or is_tokens(documents_words):
        raise ValueError(
            "documents must be a list of lexical item sequences, such as a list or tuple of words or tokens"
        )

    documents_string: list[str] = [
        sequence_to_string(sequence) for sequence in documents_words
    ]

    # Fail fast: a wrong number of column names would otherwise only surface
    # inside pandas after the whole corpus has already been vectorized.
    if document_names and len(document_names) != len(documents_string):
        raise ValueError(
            f"document_names has {len(document_names)} entries, "
            f"but {len(documents_string)} documents were provided"
        )

    vectorizer: CountVectorizer = CountVectorizer(**kwargs)

    # fit_transform yields a (documents x terms) matrix; transpose it to get
    # the (terms x documents) orientation this function returns.
    dtm: csr_array = cast(csr_array, vectorizer.fit_transform(documents_string))
    word_by_document_matrix: csr_array = dtm.T

    # An empty/None document_names falls back to positional default names.
    column_names: list[str] = (
        document_names
        if document_names
        else [f"document_{index}" for index in range(len(documents_string))]
    )

    return pd.DataFrame.sparse.from_spmatrix(
        word_by_document_matrix,
        index=vectorizer.get_feature_names_out(),
        columns=column_names,
    )