# Source code for dstk.parameters.context.selection.unit

"""
This module provides utility functions for extracting structured linguistic data from Stanza Documents.
It simplifies the process of converting processed documents into manageable Python lists
of sentences, tokens, and words to facilitate further analysis.

Core functionalities include:

* Extracting a sequence of sentences from a Stanza Document
* Retrieving all tokens (including punctuation) as a list of Token objects
* Isolating words by extracting Word objects and filtering out those tagged as punctuation (``upos == "PUNCT"``)
* Returning shallow copies of the linguistic objects rather than the instances held by the Document

The module is designed to streamline the transition between raw NLP output and
structured text analysis for linguistics-focused workflows.
"""

from copy import copy

from ....lib_types import Document, Sentence, Token, Word



# [docs]
def get_sentences(document: Document) -> list[Sentence]:
    """
    Collects the sentences of a Stanza Document into a new list.

    Every element of ``document.sentences`` is duplicated with :func:`copy.copy`
    (a shallow copy), so the returned list holds new top-level objects instead
    of the instances stored on the document.

    :param document: The Stanza Document object whose ``sentences`` are read.
    :type document: Document

    :return: Shallow copies of the document's sentences, in their original order.
    :rtype: list[Sentence]
    """

    # map() applies the shallow copy to each sentence; list() materialises the result.
    return list(map(copy, document.sentences))




# [docs]
def get_tokens(document: Document) -> list[Token]:
    """
    Collects every token of a Stanza Document into a new list.

    The tokens are taken from ``document.iter_tokens()`` and each one is
    duplicated with :func:`copy.copy` (a shallow copy) before being stored.

    :param document: The Stanza Document object to iterate over.
    :type document: Document

    :return: Shallow copies of the tokens, in iteration order.
    :rtype: list[Token]
    """

    duplicates = []

    for original in document.iter_tokens():
        # Store a shallow copy rather than the object yielded by the document.
        duplicates.append(copy(original))

    return duplicates




# [docs]
def get_words(document: Document) -> list[Word]:
    """
    Collects the non-punctuation words of a Stanza Document into a new list.

    Words are taken from ``document.iter_words()``. Any word whose ``upos``
    attribute equals ``"PUNCT"`` is skipped; every remaining word is duplicated
    with :func:`copy.copy` (a shallow copy).

    :param document: The Stanza Document object to iterate over.
    :type document: Document

    :return: Shallow copies of the words not tagged ``"PUNCT"``, in iteration order.
    :rtype: list[Word]
    """

    # Lazy filter stage: only words not tagged as punctuation pass through.
    kept = (word for word in document.iter_words() if word.upos != "PUNCT")

    # Copy stage: consume the filtered stream, shallow-copying each survivor.
    return [copy(word) for word in kept]