Source code for dstk.lib_types.dstk_types

from typing import TypeAlias, TypeVar, Any, TypedDict, NotRequired, NamedTuple, Generator
from .spacy_types import Token
from .sklearn_types import csc_matrix, csr_matrix
from .numpy_types import ndarray, NDArray, str_
from .pandas_types import Index
from .fasttext_types import FastText
from .gensim_types import Word2Vec
from collections import Counter

#: Numeric types accepted (integer or float).
Number: TypeAlias = int | float

#: A generic type variable for words, bounded to str or spaCy Token.
Word = TypeVar("Word", bound=str | Token)

#: A list of words (strings or spaCy Tokens).
Words: TypeAlias = list[Word]

#: A variable-length tuple representing a group of collocates (words).
Collocates: TypeAlias = tuple[Word, ...]

#: A list of collocate tuples.
CollocatesList: TypeAlias = list[Collocates]


[docs]
class POSTaggedWord(NamedTuple):
    """
    Represents a word paired with its Part-Of-Speech (POS) tag.

    As a ``NamedTuple``, an instance can also be used as a plain
    ``(word, pos)`` tuple, with the fields in that order.

    :param word: The word, either as a string or spaCy Token.
    :type word: str or Token
    :param pos: The POS tag of the word.
    :type pos: str
    """

    # The word itself: a plain string or a spaCy Token.
    word: str | Token
    # The part-of-speech tag attached to ``word``.
    pos: str


#: A list of POS-tagged words (:class:`POSTaggedWord` tuples).
POSTaggedWordList: TypeAlias = list[POSTaggedWord]


[docs]
class Bigram(NamedTuple):
    """
    Represents a bigram collocation between two words.

    As a ``NamedTuple``, an instance can also be used as a plain
    ``(collocate, target_word)`` tuple, with the fields in that order.

    :param collocate: The collocate word.
    :type collocate: str or Token
    :param target_word: The target word in the bigram.
    :type target_word: str
    """

    # The collocate: a plain string or a spaCy Token.
    collocate: str | Token
    # The target word; unlike ``collocate`` it is annotated as a plain string only.
    target_word: str


#: A list of bigram tuples.
BigramList: TypeAlias = list[Bigram]

#: Directed collocates represented as a tuple of a word and a pair of directional tags.
DirectedCollocates: TypeAlias = tuple[Word, tuple[str, str]]

#: A list of directed collocates.
DirectedCollocateList: TypeAlias = list[DirectedCollocates]

#: Union type of all tagged word lists.
TaggedWordsList: TypeAlias = CollocatesList | DirectedCollocateList | POSTaggedWordList | BigramList

#: A list of sentences, where each sentence is a list of words.
WordSentences: TypeAlias = list[Words]
#: Misspelled legacy name for ``WordSentences``, kept so that existing imports keep working.
WordSenteces: TypeAlias = WordSentences
#: A list of tagged sentences, each containing tagged words.
TaggedSentences: TypeAlias = list[TaggedWordsList]

#: Union type representing either plain or tagged sentences.
Sentences: TypeAlias = WordSentences | TaggedSentences


#: A tuple representing a neighboring word and its association score.

[docs]
class Neighbor(NamedTuple):
    """
    Represents a neighboring word and its association score.

    As a ``NamedTuple``, an instance can also be used as a plain
    ``(word, score)`` tuple, with the fields in that order.

    :param word: The neighboring word.
    :type word: str
    :param score: The association score of the word.
    :type score: float
    """

    word: str
    score: float

#: A list of neighboring words with scores (:class:`Neighbor` tuples).
Neighbors: TypeAlias = list[Neighbor]

#: A union of neural language model types (``Word2Vec`` or ``FastText``).
NeuralModels: TypeAlias = Word2Vec | FastText

#: A counter mapping words (strings) to their frequency counts.
WordCounts: TypeAlias = Counter[str]

#: A union of matrix types from SciPy or NumPy (``csr_matrix``, ``csc_matrix`` or ``ndarray``).
Matrix: TypeAlias = csr_matrix | csc_matrix | ndarray

#: Labels used in pandas DataFrames, representing index or column labels.
#:
#: This can be a NumPy ndarray of strings, a pandas Index, a list of strings, or None.
Labels: TypeAlias = NDArray[str_] | Index | list[str] | None

StepConfig = TypedDict(
    "StepConfig",
    {
        "include": NotRequired[list[str] | str],
        "exclude": NotRequired[dict[str, int]],
        "repeat": bool,
        "chaining": bool,
        "step_name": str
    },
    total=True
)
"""
Configuration for a processing step in a workflow.

:param include: Methods to include, either a list of strings or a single string.
:type include: list[str] or str, optional
:param exclude: Methods to exclude, as a dictionary mapping strings to integers.
:type exclude: dict[str, int], optional
:param repeat: Whether the a method can be used more than once.
:type repeat: bool
:param chaining: Whether method cchaining is enabled.
:type chaining: bool
:param step_name: The name of the step.
:type step_name: str
"""

class WorkflowTemplate(TypedDict):
    """
    Template for an entire workflow, consisting of steps, a base type and triggers.

    :param steps: Mapping from step numbers to step configurations.
    :type steps: dict[int, StepConfig]
    :param base_type: The base type of the workflow.
    :type base_type: str
    :param triggers: Mapping from method names to the data types they produce. When a method changes the current data type (the default return type), the corresponding trigger activates rules that enable or disable subsequent methods.
    :type triggers: dict[str, str]
    """

    steps: dict[int, StepConfig]
    base_type: str
    triggers: dict[str, str]

#: A workflow is a list of ordered steps, where each step is a dictionary
#: mapping method names to their keyword arguments.
Workflow: TypeAlias = list[dict[str, dict[str, Any]]]
#: A stage workflow contains multiple workflows organized by module names.
#: Each key is a module name (e.g., 'tokenizer', 'ngrams', 'text_processor'),
#: and the value is the workflow (list of steps) for that module.
StageWorkflow: TypeAlias = dict[str, Workflow]
#: Mapping from stage names to their corresponding workflow templates.
#:
#: Each key is a stage name (a string identifying a module),
#: and the value is a ``WorkflowTemplate`` describing the processing steps and triggers
#: allowed in that stage.
StageTemplate: TypeAlias = dict[str, WorkflowTemplate]
#: Mapping from stage indices (integers) to sets of module names allowed in that stage.
#:
#: Each key is a stage number, and the value is a set of module names (strings) that
#: are enabled or active during that stage of the stage workflow.
StageModules: TypeAlias = dict[int, set[str]]

ExcludedMethods = TypedDict(
    "ExcludedMethods",
    {
        "exclude": list[str] | str
    }
)
"""
Specifies methods to exclude by name.

:param exclude: A list of method names or a single method name to exclude.
:type exclude: list[str] or str
"""

#: Template defining rules for excluding methods once a specific type is triggered.
#:
#: The outer dictionary keys are module names (e.g., 'tokenizer', 'text_processor'),
#: and the values are ``ExcludedMethods`` dictionaries specifying which methods
#: should be excluded in that module.
#:
#: For example, when the data type changes to 'POSTaggedWordList', these rules
#: prevent further usage of specific methods like 'pos_tagger' in the tokenizer module.
RulesTemplate: TypeAlias = dict[str, ExcludedMethods]


[docs]
class StepResult(NamedTuple):
    """
    Represents the result of executing a single workflow or model step.

    As a ``NamedTuple``, an instance can also be used as a plain
    ``(name, result)`` tuple, with the fields in that order.

    :param name: The name of the step.
    :type name: str
    :param result: The output produced by the step.
    :type result: Any
    """

    name: str
    result: Any



#: Generator that yields ``StepResult`` objects, each representing the name and result of a workflow step.
#: Its send and return types are both ``None``.
StepGenerator: TypeAlias = Generator[StepResult, None, None]

#: Generator that yields results of workflow steps without step metadata.
#: Its send and return types are both ``None``.
ResultGenerator: TypeAlias = Generator[Any, None, None]