Source code for dstk.modules.predict_models
"""
This module provides utilities to train, save, and load word embedding models using neural network models such as Word2Vec (gensim) and FastText (fasttext library).
Functions include:
* *word2vec:* Train Word2Vec embeddings from a corpus file.
* *fastText:* Train FastText embeddings from a corpus file.
* *load_model:* Load a saved model from disk (supports Word2Vec .model and FastText .bin formats).
* *save_model:* Save a trained model to disk in the appropriate format.
Each function supports passing additional keyword arguments to fine-tune training and loading.
"""
from gensim.models import Word2Vec
import fasttext
from pathlib import Path
from ..lib_types import FastText, NeuralModels
[docs]
def word2vec(path: str, **kwargs) -> Word2Vec:
    """
    Trains Word2Vec word embeddings from a corpus file.

    :param path: The path to a file containing a list of sentences or collocations from which to build word embeddings. It is handed to gensim as ``corpus_file``.
    :type path: str
    :param kwargs: Extra keyword arguments forwarded unchanged to gensim.models.Word2Vec. Common options include:
        * **vector_size:** Size of the word embedding vectors.
        * **workers:** Number of CPU cores to be used during the training process.
        * **sg:** Training algorithm. 1 for skip-gram; 0 for CBOW (Continuous Bag of Words).
        * **window (int):** Maximum distance between the current and predicted word.
        * **min_count (int):** Ignores all words with total frequency lower than this.

        For more information check: https://radimrehurek.com/gensim/models/word2vec.html
    :returns: An instance of gensim's Word2Vec.
    :rtype: Word2Vec
    """
    # The corpus is always supplied through ``corpus_file``; every other
    # setting comes from the caller's keyword arguments.
    model = Word2Vec(corpus_file=path, **kwargs)
    return model
[docs]
def fastText(path: str, **kwargs) -> FastText:
    """
    Trains FastText word embeddings from a corpus file.

    :param path: The path to a file containing a list of sentences or collocations from which to build word embeddings.
    :type path: str
    :param kwargs: Extra keyword arguments forwarded unchanged to fasttext.train_unsupervised. Common options include:
        * **dim:** Size of the word embedding vectors.
        * **model:** Training algorithm: skipgram or cbow (Continuous Bag of Words)
        * **thread:** Number of CPU cores to be used during the training process.

        For more information check: https://fasttext.cc/docs/en/options.html
    :returns: An instance of fasttext's FastText.
    :rtype: FastText
    """
    # The corpus path is passed positionally; every other setting comes
    # from the caller's keyword arguments.
    model = fasttext.train_unsupervised(path, **kwargs)
    return model
[docs]
def load_model(path: str) -> NeuralModels:
    """
    Loads a saved embedding model, choosing the loader from the file extension.

    A ``.model`` file is loaded with gensim's ``Word2Vec.load`` and a ``.bin`` file with ``fasttext.load_model``. The extension is compared case-insensitively.

    :param path: Path to the saved model file.
    :type path: str
    :returns: An instance of gensim's Word2Vec or fasttext's FastText.
    :rtype: NeuralModels
    :raises ValueError: If the file extension is neither ``.model`` nor ``.bin``.
    """
    extension: str = Path(path).suffix.lower()

    # Reject unsupported extensions up front so the loaders below only
    # ever see a format they understand.
    if extension not in (".model", ".bin"):
        raise ValueError(f"Model extension {extension} not recognized.")

    if extension == ".bin":
        return fasttext.load_model(path)

    return Word2Vec.load(path)
[docs]
def save_model(model: NeuralModels, path: str) -> str:
    """
    Saves the trained embeddings in .model (Word2Vec) or .bin (FastText) format, depending on the algorithm used.

    :param model: A trained Word2Vec or FastText model.
    :type model: NeuralModels
    :param path: The path (without extension) where to save the model. The format-specific extension is applied with ``Path.with_suffix``, so an extension already present in the last path component is replaced.
    :type path: str
    :returns: The absolute path of the file that was written, including its ``.model`` or ``.bin`` extension.
    :rtype: str
    :raises NotImplementedError: If ``model`` is neither a Word2Vec nor a FastText instance.
    """
    base_path: Path = Path(path)

    if isinstance(model, Word2Vec):
        target: Path = base_path.with_suffix(".model")
        model.save(str(target))
    elif isinstance(model, FastText):
        target = base_path.with_suffix(".bin")
        model.save_model(str(target))
    else:
        # Take the name from the class, not the instance: instances normally
        # have no ``__name__``, so the lookup would fail with AttributeError
        # before this error could be raised.
        raise NotImplementedError(
            f"Model identifier type {type(model).__name__} not yet supported"
        )

    # Return the path that was actually written (with its extension), not
    # the extension-less input path.
    return str(target.resolve())