Source code for dstk.parameters.co_matrix.weighting.relevance_measures

"""
This module provides functions for calculating relevance measures to weight word-document matrices.
These measures are used to assess the informativeness of lexical items within a corpus,
helping to distinguish specific content from common distributional noise.

Based on information retrieval principles, these weights combine local components (such as term frequency)
with global components (reflecting the overall informativeness of a term). By applying such filters,
the module reduces the influence of terms that appear frequently across many documents and enhances
the weight of terms associated with specific contexts.

Core functionalities include:
* Applying Term Frequency-Inverse Document Frequency (TF-IDF) to word-by-document matrices.
* Weighting lexical items based on their global informativeness across a training corpus.
* Transforming raw word counts into weighted representations suitable for linguistic analysis and
  distributional modeling.

The module is intended to provide tools for quantifying term relevance in the context of
co-occurrence matrices and distributional representation.
"""

import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer
from ....utilities.dataframe_manipulation import is_sparse_dataframe
from scipy.sparse import coo_array

from ....lib_types import DataFrame, csr_array



[docs]
def tf_idf(word_by_document_matrix: DataFrame, **kwargs) -> DataFrame:
    """
    Re-weight a word-by-document count matrix with TF-IDF.

    The input is converted to a SciPy COO sparse container, transposed so that
    documents lie on the rows, passed through scikit-learn's
    ``TfidfTransformer``, and transposed back. The result therefore keeps the
    orientation and the labels of the input.

    :param word_by_document_matrix: A DataFrame of word-document counts. Both
        sparse DataFrames (as reported by ``is_sparse_dataframe``) and
        non-sparse ones are accepted.
    :type word_by_document_matrix: DataFrame
    :param kwargs: Keyword arguments forwarded unchanged to scikit-learn's
        ``TfidfTransformer``. For the available options check:
        https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html

    :return: Sparse DataFrame holding the TF-IDF weights, labelled with the
        index and columns of ``word_by_document_matrix``.
    :rtype: DataFrame
    """

    # Obtain a COO representation, reusing the sparse storage when available.
    if is_sparse_dataframe(word_by_document_matrix):
        counts = word_by_document_matrix.sparse.to_coo()
    else:
        counts = coo_array(word_by_document_matrix)

    weighter = TfidfTransformer(**kwargs)

    # The transformer is fed documents on rows and words on columns, so the
    # matrix is flipped on the way in and flipped back on the way out.
    documents_by_words = counts.T
    words_by_documents = weighter.fit_transform(documents_by_words).T

    weighted_frame = pd.DataFrame.sparse.from_spmatrix(
        words_by_documents,
        index=word_by_document_matrix.index,
        columns=word_by_document_matrix.columns,
    )

    # Replace any missing values with 0 before handing the frame back.
    return weighted_frame.fillna(0)