Source code for dstk.modules.predict_models

"""
This module provides utilities to train, save, and load word embedding models using neural network models such as Word2Vec (gensim) and FastText (fasttext library).

Functions include:

* *word2vec:* Train Word2Vec embeddings from a corpus file.
* *fastText:* Train FastText embeddings from a corpus file.
* *load_model:* Load a saved model from disk (supports Word2Vec .model and FastText .bin formats).
* *save_model:* Save a trained model to disk in the appropriate format.

Each function supports passing additional keyword arguments to fine-tune training and loading.
"""

from gensim.models import Word2Vec
import fasttext
from pathlib import Path

from ..lib_types import FastText, NeuralModels


[docs]
def word2vec(path: str, **kwargs) -> Word2Vec:
    """
    Builds word embeddings with gensim's Word2Vec from a corpus file.

    The file path is handed to ``gensim.models.Word2Vec`` as its ``corpus_file`` argument; every other keyword argument is forwarded untouched.

    :param path: The path to a file containing a list of sentences or collocations from which to build word embeddings.
    :type path: str
    :param kwargs: Additional keyword arguments to pass to gensim.models.Word2Vec. Common options include:

        * **vector_size:** Size of the word embedding vectors.
        * **workers:** Number of CPU cores to be used during the training process.
        * **sg:** Training algorithm. 1 for skip-gram; 0 for CBOW (Continuous Bag of Words).
        * **window (int):** Maximum distance between the current and predicted word.
        * **min_count (int):** Ignores all words with total frequency lower than this.

    For more information check: https://radimrehurek.com/gensim/models/word2vec.html

    :returns: An instance of gensim's Word2Vec.
    :rtype: Word2Vec
    """
    embeddings = Word2Vec(corpus_file=path, **kwargs)
    return embeddings



[docs]
def fastText(path: str, **kwargs) -> FastText:
    """
    Builds word embeddings with the fasttext library from a corpus file.

    The file path is passed as the first positional argument of ``fasttext.train_unsupervised``; every other keyword argument is forwarded untouched.

    :param path: The path to a file containing a list of sentences or collocations from which to build word embeddings.
    :type path: str
    :param kwargs: Additional keyword arguments to pass to fasttext.train_unsupervised. Common options include:

        * **dim:** Size of the word embedding vectors.
        * **model:** Training algorithm: skipgram or cbow (Continuous Bag of Words)
        * **thread:** Number of CPU cores to be used during the training process.

    For more information check: https://fasttext.cc/docs/en/options.html

    :returns: An instance of fasttext's FastText.
    :rtype: FastText
    """
    embeddings = fasttext.train_unsupervised(path, **kwargs)
    return embeddings



[docs]
def load_model(path: str) -> NeuralModels:
    """
    Loads a previously saved embedding model, choosing the loader from the file extension.

    The extension is compared case-insensitively: ``.model`` files are loaded with ``Word2Vec.load`` and ``.bin`` files with ``fasttext.load_model``.

    :param path: Path to the saved model file.
    :type path: str

    :returns: An instance of gensim's Word2Vec or fasttext's FastText.
    :rtype: NeuralModels

    :raises ValueError: If the file extension is neither ``.model`` nor ``.bin``.
    """
    # Lower-case the suffix so that e.g. ".BIN" and ".bin" are treated alike.
    extension = Path(path).suffix.lower()

    if extension == ".model":
        return Word2Vec.load(path)

    if extension == ".bin":
        return fasttext.load_model(path)

    # Anything else (including a missing extension) is rejected.
    raise ValueError(f"Model extension {extension} not recognized.")



[docs]
def save_model(model: NeuralModels, path: str) -> str:
    """
    Saves the trained embeddings in .model (Word2Vec) or .bin (FastText) format, depending on the algorithm used.

    :param model: A trained Word2Vec or FastText model.
    :type model: NeuralModels
    :param path: The path where to save the model. Its final suffix, if any, is replaced by the model-specific extension (.model or .bin).
    :type path: str

    :returns: The resolved absolute form of ``path`` as given. The model-specific extension is not applied to the returned value.
    :rtype: str

    :raises NotImplementedError: If ``model`` is neither a Word2Vec nor a FastText instance.
    """
    full_path: Path = Path(path)

    if isinstance(model, Word2Vec):
        model.save(str(full_path.with_suffix(".model")))
    elif isinstance(model, FastText):
        model.save_model(str(full_path.with_suffix(".bin")))
    else:
        # Report the class of the offending object. Reading ``model.__name__``
        # would raise AttributeError on an arbitrary instance and hide this error.
        raise NotImplementedError(f"Model identifier type {type(model).__name__} not yet supported")

    # NOTE(review): the returned path lacks the suffix of the file written above;
    # kept as-is for backward compatibility — confirm whether callers expect this.
    return str(full_path.resolve())