Source code for dstk.adaptors.typeguards
"""
Provides a set of type guard functions to safely and explicitly check the types of various token and workflow-related objects.
These functions help with runtime type checking and enable more precise type hinting and static analysis when working with linguistic data structures such as:
* POS-tagged word lists
* Collocates lists
* Sentences (token or string sequences)
* Workflow step definitions
* Token-based collocates
By using these type guards, code can branch safely based on the structure and types of input data, improving robustness and developer experience.
Example:
.. code-block:: python
if is_pos_tags(tokens):
# tokens is now narrowed to POSTaggedWordList type
process_pos_tags(tokens)
"""
from typing import Any, TypeGuard
from ..lib_types import POSTaggedWordList, CollocatesList, Sentences, Token, Workflow, POSTaggedWord, Bigram, Collocates
def is_collocates(tokens: Any) -> TypeGuard[CollocatesList]:
    """
    Checks if the input is a list of collocate tuples, where each tuple contains strings or Token instances, excluding types like POSTaggedWord or Bigram.

    :param tokens: The object to check.
    :type tokens: Any
    :return: True if `tokens` is a non-empty list of tuples of strings or Token instances (excluding POSTaggedWord and Bigram), otherwise False.
    :rtype: bool
    """
    if not isinstance(tokens, list) or not tokens:
        return False
    return all(
        isinstance(item, tuple) and
        not isinstance(item, (POSTaggedWord, Bigram)) and
        all(isinstance(word, (str, Token)) for word in item)
        for item in tokens
    )
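
# Usage sketch (an illustrative addition, not part of the original module).
# Plain string tuples already satisfy the guard, so no Token objects are
# needed; the example data below is made up.
def _example_is_collocates() -> None:
    candidates: Any = [("strong", "coffee"), ("heavy", "rain")]
    if is_collocates(candidates):
        # Inside this branch, static checkers narrow `candidates` to CollocatesList.
        print(len(candidates), "collocate pairs")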
def is_sentence(tokens: Any) -> TypeGuard[Sentences]:
    """
    Checks if the input is a list of sentences, where each sentence is either:

    * A list of Token instances,
    * A list of strings, or
    * A list of POSTaggedWord instances.

    :param tokens: The object to check.
    :type tokens: Any
    :return: True if `tokens` matches the described sentence structure, otherwise False.
    :rtype: bool
    """
    if not isinstance(tokens, list) or not tokens:
        return False
    return all(
        (
            isinstance(item, list) and
            (
                all(isinstance(token, Token) for token in item) or
                all(isinstance(token, str) for token in item)
            )
        ) or is_pos_tags(item)
        for item in tokens
    )
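
# Usage sketch (an illustrative addition, not part of the original module).
# A list of string lists is one of the accepted sentence shapes; the example
# data below is made up.
def _example_is_sentence() -> None:
    data: Any = [["the", "cat", "sat"], ["on", "the", "mat"]]
    if is_sentence(data):
        # Inside this branch, static checkers narrow `data` to Sentences.
        print(sum(len(sentence) for sentence in data), "tokens in total")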
def is_workflow(workflow: Any) -> TypeGuard[Workflow]:
    """
    Checks if the input is a workflow structure, i.e., a non-empty list of dictionaries where each dictionary maps string method names to argument dictionaries with string keys.

    :param workflow: The object to check.
    :type workflow: Any
    :return: True if `workflow` matches the workflow structure, otherwise False.
    :rtype: bool
    """
    if not isinstance(workflow, list) or not workflow:
        return False
    return all(
        isinstance(method, dict) and
        all(
            isinstance(key, str) and
            isinstance(value, dict) and
            all(isinstance(arg, str) for arg in value.keys())
            for key, value in method.items()
        )
        for method in workflow
    )
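
# Usage sketch (an illustrative addition, not part of the original module).
# The method names and arguments below are hypothetical; the guard only
# requires string keys mapping to dictionaries with string keys.
def _example_is_workflow() -> None:
    steps: Any = [{"tokenize": {"model": "en"}}, {"remove_stopwords": {}}]
    if is_workflow(steps):
        # Inside this branch, static checkers narrow `steps` to Workflow.
        print(len(steps), "workflow steps")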
def is_token_collocates(collocates: Collocates) -> TypeGuard[Collocates[Token]]:
    """
    Checks if the input collocates tuple consists exclusively of Token instances and excludes Bigram and POSTaggedWord types.

    :param collocates: The collocates tuple to check.
    :type collocates: Collocates
    :return: True if all elements in `collocates` are Token instances and not Bigram or POSTaggedWord, otherwise False.
    :rtype: bool
    """
    return (
        isinstance(collocates, tuple) and
        not isinstance(collocates, (Bigram, POSTaggedWord)) and
        all(isinstance(token, Token) for token in collocates)
    )
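
# Usage sketch (an illustrative addition, not part of the original module).
# Constructing Token instances depends on ..lib_types, so this sketch only
# shows the negative case: a tuple of plain strings is rejected by the guard.
def _example_is_token_collocates() -> None:
    pair: Any = ("strong", "coffee")
    if not is_token_collocates(pair):
        # Plain strings are not Token instances, so the guard returns False.
        print("not a Token-only collocates tuple")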