Source code for dstk.modules.weight_matrix
"""
This module provides functions to apply weighting schemes to co-occurrence matrices commonly used in natural language processing and text mining.
Available weighting methods include:
* Pointwise Mutual Information (PMI) and Positive PMI (PPMI), which measure the association strength between co-occurring terms by comparing observed co-occurrence frequencies to expected frequencies under independence.
* Term Frequency-Inverse Document Frequency (Tf-idf), which reweights term importance based on frequency patterns, leveraging sklearn's TfidfTransformer.
These weighting techniques help enhance the semantic relevance of co-occurrence matrices, improving downstream tasks such as word embedding, clustering, and semantic similarity analysis.
All functions return weighted co-occurrence matrices as Pandas DataFrames for convenient further analysis.
"""
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer
from ..lib_types import DataFrame, ndarray, Series, csr_matrix
[docs]
def pmi(co_matrix: DataFrame, positive: bool = False) -> DataFrame:
"""
Weights a Co-occurrence matrix by PMI or PPMI.
:param co_matrix: A Co-occurrence matrix to be weighted.
:type co_matrix: DataFrame
:param positive: If True, weights the Co-ocurrence matrix by PPMI. If False, weighths it by PMI. Defaults to False.
:type positive: bool
:returns: A Co-occurrence matrix weighted by PMI or PPMI.
:rtype: DataFrame
"""
df: DataFrame = co_matrix
col_totals: Series = df.sum(axis=0)
total: float = col_totals.sum()
row_totals: Series = df.sum(axis=1)
expected: ndarray = np.outer(row_totals, col_totals) / total
df = df / expected
# Silence distracting warnings about log(0):
with np.errstate(divide='ignore'):
df = np.log(df)
df[np.isinf(df)] = 0.0 # log(0) = 0
if positive:
df[df < 0] = 0.0
return df
[docs]
def tf_idf(co_matrix: DataFrame, **kwargs) -> DataFrame:
"""
Weights a Co-occurrence matrix by Tf-idf.
:param co_matrix: A Co-occurrence matrix to be weighted.
:type co_matrix: DataFrame
:param kwargs: Additional keyword arguments to pass to sklearn's TfidfTransformer. For more information check: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html
:returns: A Co-occurrence matrix weighted by Tf-idf.
:rtype: DataFrame
"""
transformer: TfidfTransformer = TfidfTransformer(**kwargs)
tf_idf_matrix: csr_matrix = transformer.fit_transform(co_matrix)
return pd.DataFrame(tf_idf_matrix.toarray(), index=co_matrix.index, columns=co_matrix.columns)