Source code for dstk.lib_types.dstk_types

from typing import TypeAlias, TypeVar, Any, TypedDict, NotRequired, NamedTuple, Generator
from .spacy_types import Token
from .sklearn_types import csc_matrix, csr_matrix
from .numpy_types import ndarray, NDArray, str_
from .pandas_types import Index
from .fasttext_types import FastText
from .gensim_types import Word2Vec
from collections import Counter

#: Numeric types accepted (integer or float).
Number: TypeAlias = int | float

#: A generic type variable for words, bounded to str or spaCy Token.
Word = TypeVar("Word", bound=str | Token)

#: A list of words (strings or spaCy Tokens). Generic over ``Word``.
Words: TypeAlias = list[Word]

#: A tuple representing a group of collocates (words). Generic over ``Word``.
Collocates: TypeAlias = tuple[Word, ...]

#: A list of collocate tuples.
CollocatesList: TypeAlias = list[Collocates]

class POSTaggedWord(NamedTuple):
    """
    Represents a word paired with its Part-Of-Speech (POS) tag.

    :param word: The word, either as a string or spaCy Token.
    :type word: str or Token
    :param pos: The POS tag of the word.
    :type pos: str
    """

    word: str | Token
    pos: str
#: A list of POS-tagged words.
POSTaggedWordList: TypeAlias = list[POSTaggedWord]
class Bigram(NamedTuple):
    """
    Represents a bigram collocation between two words.

    :param collocate: The collocate word.
    :type collocate: str or Token
    :param target_word: The target word in the bigram.
    :type target_word: str
    """

    collocate: str | Token
    target_word: str
#: A list of bigram tuples.
BigramList: TypeAlias = list[Bigram]

#: Directed collocates represented as a tuple of a word and a pair of directional tags.
DirectedCollocates: TypeAlias = tuple[Word, tuple[str, str]]

#: A list of directed collocates.
DirectedCollocateList: TypeAlias = list[DirectedCollocates]

#: Union type of all tagged word lists.
TaggedWordsList: TypeAlias = (
    CollocatesList | DirectedCollocateList | POSTaggedWordList | BigramList
)

# NOTE: the alias below is spelled ``WordSenteces`` (sic); the name is kept
# unchanged so that existing imports of it keep working.
#: A list of sentences, where each sentence is a list of words.
WordSenteces: TypeAlias = list[Words]

#: A list of tagged sentences, each containing tagged words.
TaggedSentences: TypeAlias = list[TaggedWordsList]

#: Union type representing either plain or tagged sentences.
Sentences: TypeAlias = WordSenteces | TaggedSentences

#: A tuple representing a neighboring word and its association score.
class Neighbor(NamedTuple):
    """
    Represents a neighboring word and its association score.

    :param word: The neighboring word.
    :type word: str
    :param score: The association score of the neighboring word.
    :type score: float
    """

    word: str
    score: float
#: A list of neighboring words with scores.
Neighbors: TypeAlias = list[Neighbor]

#: A union of neural language model types.
NeuralModels: TypeAlias = Word2Vec | FastText

#: A counter mapping words (strings) to their frequency counts.
WordCounts: TypeAlias = Counter[str]

#: A union of matrix types from SciPy or NumPy.
Matrix: TypeAlias = csr_matrix | csc_matrix | ndarray

#: Labels used in pandas DataFrames, representing index or column labels.
#:
#: This can be a NumPy ndarray of strings, a pandas Index, a list of strings, or None.
Labels: TypeAlias = NDArray[str_] | Index | list[str] | None

StepConfig = TypedDict(
    "StepConfig",
    {
        "include": NotRequired[list[str] | str],
        "exclude": NotRequired[dict[str, int]],
        "repeat": bool,
        "chaining": bool,
        "step_name": str,
    },
    total=True,
)
"""
Configuration for a processing step in a workflow.

:param include: Methods to include, either a list of strings or a single string.
:type include: list[str] or str, optional
:param exclude: Methods to exclude, as a dictionary mapping strings to integers.
:type exclude: dict[str, int], optional
:param repeat: Whether a method can be used more than once.
:type repeat: bool
:param chaining: Whether method chaining is enabled.
:type chaining: bool
:param step_name: The name of the step.
:type step_name: str
"""

WorkflowTemplate = TypedDict(
    "WorkflowTemplate",
    {
        "steps": dict[int, StepConfig],
        "base_type": str,
        "triggers": dict[str, str],
    },
)
"""
Template for an entire workflow, consisting of steps, a base type and triggers.

:param steps: Mapping from step numbers to step configurations.
:type steps: dict[int, StepConfig]
:param base_type: The base type of the workflow.
:type base_type: str
:param triggers: Mapping from method names to the data types they produce.
    When a method changes the current data type (the default return type),
    the corresponding trigger activates rules that enable or disable
    subsequent methods.
:type triggers: dict[str, str]
"""

#: A workflow is a list of ordered steps, where each step is a dictionary
#: mapping method names to their keyword arguments.
Workflow: TypeAlias = list[dict[str, dict[str, Any]]]

#: A stage workflow contains multiple workflows organized by module names.
#: Each key is a module name (e.g., 'tokenizer', 'ngrams', 'text_processor'),
#: and the value is the workflow steps for that module.
StageWorkflow: TypeAlias = dict[str, Workflow]

#: Mapping from stage names to their corresponding workflow templates.
#:
#: Each key is a stage name (a string identifying a module),
#: and the value is a `WorkflowTemplate` describing the processing steps and triggers
#: allowed in that stage.
StageTemplate: TypeAlias = dict[str, WorkflowTemplate]

#: Mapping from stage indices (integers) to sets of module names allowed in that stage.
#:
#: Each key is a stage number, and the value is a set of module names (strings) that
#: are enabled or active during that stage of the stage workflow.
StageModules: TypeAlias = dict[int, set[str]]

ExcludedMethods = TypedDict(
    "ExcludedMethods",
    {
        "exclude": list[str] | str,
    },
)
"""
Specifies methods to exclude by name.

:param exclude: A list of method names or a single method name to exclude.
:type exclude: list[str] or str
"""

#: Template defining rules for excluding methods once a specific type is triggered.
#:
#: The outer dictionary keys are module names (e.g., 'tokenizer', 'text_processor'),
#: and the values specify which methods should be excluded in that module.
#:
#: For example, when the data type changes to 'POSTaggedWordList', these rules
#: prevent further usage of specific methods like 'pos_tagger' in the tokenizer module.
RulesTemplate: TypeAlias = dict[str, ExcludedMethods]
class StepResult(NamedTuple):
    """
    Represents the result of executing a single workflow or model step.

    :param name: The name of the step.
    :type name: str
    :param result: The output produced by the step.
    :type result: Any
    """

    name: str
    result: Any
#: Generator that yields `StepResult` objects, each representing the name and result of a workflow step.
StepGenerator: TypeAlias = Generator[StepResult, None, None]

#: Generator that yields results of workflow steps without step metadata.
ResultGenerator: TypeAlias = Generator[Any, None, None]