Source code for dstk.modules.weight_matrix

"""
This module provides functions to apply weighting schemes to co-occurrence matrices commonly used in natural language processing and text mining.

Available weighting methods include:

* Pointwise Mutual Information (PMI) and Positive PMI (PPMI), which measure the association strength between co-occurring terms by comparing observed co-occurrence frequencies to expected frequencies under independence.
* Term Frequency-Inverse Document Frequency (Tf-idf), which reweights term importance based on frequency patterns, leveraging sklearn's TfidfTransformer.

These weighting techniques help enhance the semantic relevance of co-occurrence matrices, improving downstream tasks such as word embedding, clustering, and semantic similarity analysis.

All functions return weighted co-occurrence matrices as Pandas DataFrames for convenient further analysis.
"""

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer

from ..lib_types import DataFrame, ndarray, Series, csr_matrix


def pmi(co_matrix: DataFrame, positive: bool = False) -> DataFrame:
    """
    Weights a Co-occurrence matrix by PMI or PPMI.

    Each cell is replaced by the natural logarithm of the ratio between the
    observed count and the count expected under independence, i.e.
    ``log(observed * total / (row_total * column_total))``.

    Cells whose PMI is undefined are set to ``0.0``. This covers cells with a
    zero observed count (``log(0)``) as well as cells lying on a row or column
    whose total is zero (``0 / 0``), so the result never contains ``NaN`` or
    infinite values. The input matrix is not modified.

    :param co_matrix: A Co-occurrence matrix to be weighted.
    :type co_matrix: DataFrame
    :param positive: If True, weights the Co-occurrence matrix by PPMI
        (negative values are clamped to 0). If False, weights it by PMI.
        Defaults to False.
    :type positive: bool

    :returns: A Co-occurrence matrix weighted by PMI or PPMI, with the same
        index and columns as ``co_matrix``.
    :rtype: DataFrame
    """
    col_totals: Series = co_matrix.sum(axis=0)
    row_totals: Series = co_matrix.sum(axis=1)
    total: float = col_totals.sum()

    # Silence distracting warnings about log(0) and 0/0; the resulting
    # non-finite cells are normalised to 0.0 right below.
    with np.errstate(divide="ignore", invalid="ignore"):
        # Counts expected if row and column events were independent.
        expected: ndarray = np.outer(row_totals, col_totals) / total
        weighted: DataFrame = np.log(co_matrix / expected)

    # Undefined associations (-inf from log(0), NaN from empty rows/columns)
    # are treated as "no association".
    weighted[~np.isfinite(weighted)] = 0.0

    if positive:
        weighted[weighted < 0] = 0.0

    return weighted
def tf_idf(co_matrix: DataFrame, **kwargs) -> DataFrame:
    """
    Weights a Co-occurrence matrix by Tf-idf.

    A fresh sklearn ``TfidfTransformer`` is built from ``kwargs``, fitted on
    ``co_matrix`` and used to transform it in a single step. The sparse
    result is densified and wrapped in a DataFrame that reuses the index and
    columns of the input.

    :param co_matrix: A Co-occurrence matrix to be weighted.
    :type co_matrix: DataFrame
    :param kwargs: Additional keyword arguments to pass to sklearn's
        TfidfTransformer. For more information check:
        https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html

    :returns: A Co-occurrence matrix weighted by Tf-idf.
    :rtype: DataFrame
    """
    weighted: csr_matrix = TfidfTransformer(**kwargs).fit_transform(co_matrix)

    # Restore the original labels on the densified result.
    return pd.DataFrame(
        weighted.toarray(),
        index=co_matrix.index,
        columns=co_matrix.columns,
    )