Source code for dstk.parameters.dimensionality_reduction

"""
This module offers functionality to transform and reduce high-dimensional text data represented as matrices, enabling more effective downstream analysis and modeling.

Key features include:

* Scaling input matrices to zero mean and unit variance using standardization.
* Generating low-dimensional word embeddings from co-occurrence matrices using dimensionality reduction techniques:

  * Truncated Singular Value Decomposition (SVD)
  * Principal Component Analysis (PCA)

These techniques help distill semantic information from sparse and high-dimensional co-occurrence data, facilitating tasks such as clustering, visualization, and feature extraction in natural language processing pipelines.

All functions return results as Pandas DataFrames for seamless integration with data workflows.
"""

from sklearn.decomposition import PCA, TruncatedSVD
import pandas as pd
from ..utilities.dataframe_manipulation import is_sparse_dataframe
from scipy.sparse import coo_array

from ..lib_types import ndarray, DataFrame


def svd(matrix: DataFrame, n_dimensions: int = 300, **kwargs) -> DataFrame:
    """
    Generates word embeddings using truncated Singular Value Decomposition (SVD).

    :param matrix: A co-occurrence matrix from which embeddings will be generated.
    :type matrix: DataFrame
    :param n_dimensions: The number of dimensions to reduce the word embeddings to. Defaults to 300.
    :type n_dimensions: int
    :param kwargs: Additional keyword arguments to pass to sklearn's TruncatedSVD. ``random_state`` defaults to 42 when it is not supplied here. For more information check: https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.TruncatedSVD.html

    :returns: A DataFrame of word embeddings generated by SVD. It keeps the index of ``matrix`` and names its columns ``dim_0``, ``dim_1``, ...
    :rtype: DataFrame
    """
    # Frames flagged by is_sparse_dataframe are converted through the pandas
    # sparse accessor; every other frame is handed to scipy's coo_array.
    matrix_array: coo_array = (
        matrix.sparse.to_coo() if is_sparse_dataframe(matrix) else coo_array(matrix)
    )

    # The seed is a default, not a fixed argument: passing it explicitly next
    # to **kwargs would raise "got multiple values for keyword argument"
    # whenever the caller supplies their own random_state.
    kwargs.setdefault("random_state", 42)
    reducer: TruncatedSVD = TruncatedSVD(n_components=n_dimensions, **kwargs)

    embeddings: ndarray = reducer.fit_transform(matrix_array)

    # One column label per component actually produced by the reducer.
    columns: list[str] = [f"dim_{num}" for num in range(embeddings.shape[1])]

    return pd.DataFrame(embeddings, index=matrix.index, columns=columns)
def pca(matrix: DataFrame, n_dimensions: int | float = 300, **kwargs) -> DataFrame:
    """
    Generates word embeddings using Principal Component Analysis (PCA).

    :param matrix: A co-occurrence matrix from which embeddings will be generated.
    :type matrix: DataFrame
    :param n_dimensions: If an integer, the number of dimensions to reduce the word embeddings to. If a float between 0 and 1, specifies the proportion of variance to preserve; in that case the matrix is converted to a dense array before fitting. Defaults to 300.
    :type n_dimensions: int or float
    :param kwargs: Additional keyword arguments to pass to sklearn's PCA. ``random_state`` defaults to 42 when it is not supplied here. For more information check: https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html

    :returns: A DataFrame of word embeddings generated by PCA. It keeps the index of ``matrix`` and names its columns ``dim_0``, ``dim_1``, ...
    :rtype: DataFrame
    """
    # Frames flagged by is_sparse_dataframe are converted through the pandas
    # sparse accessor; every other frame is handed to scipy's coo_array.
    sparse_input: coo_array = (
        matrix.sparse.to_coo() if is_sparse_dataframe(matrix) else coo_array(matrix)
    )

    # NOTE(review): per the sklearn PCA docs, sparse input is limited to
    # solvers that need an integer component count, so a variance-ratio
    # (float) request is fitted on a dense copy instead. Integer requests keep
    # the sparse path unchanged.
    matrix_array: coo_array | ndarray = (
        sparse_input.toarray() if isinstance(n_dimensions, float) else sparse_input
    )

    # The seed is a default, not a fixed argument: passing it explicitly next
    # to **kwargs would raise "got multiple values for keyword argument"
    # whenever the caller supplies their own random_state.
    kwargs.setdefault("random_state", 42)
    reducer: PCA = PCA(n_components=n_dimensions, **kwargs)

    embeddings: ndarray = reducer.fit_transform(matrix_array)

    # A float n_dimensions yields a data-dependent number of components, so
    # the labels are derived from the fitted output, not from the request.
    columns: list[str] = [f"dim_{num}" for num in range(embeddings.shape[1])]

    return pd.DataFrame(embeddings, index=matrix.index, columns=columns)