# Source code for dstk.parameters.context.extraction.linguistic.word.window

"""
This module provides tools for extracting context based on "Linguistic Units" by
identifying "Window-based collocates." Within the framework of distributional semantics,
this module focuses on Lexeme contexts—extracting words that appear in close
proximity to a target word within a defined window.

Core functionalities include:

* Extracting collocates from specified left and right windows around a target word.
* Filtering context windows based on part-of-speech (POS) tags to refine linguistic data.
* Generating directed bigrams, which specify the directional relationship between
  a context word and a target word (e.g., Left or Right).
* Extracting undirected bigrams where only physical proximity is considered.
* Generating n-grams from sequences of lexical items for fixed-length
  proximity analysis.

The module is specifically tailored for Stanza Word objects and sequences of Lexical Items.
"""

from nltk.util import ngrams

from typing import Sequence, cast
from ......lib_types import (
    Collocates,
    DirectedCollocates,
    Bigram,
    Word,
    ContextGenerator,
    LexicalItem,
)


def _find_contexts(
    words: list[Word],
    target_word: str,
    window_size: tuple[int, int],
    allowed_pos: set[str] | None = None,
) -> ContextGenerator:
    """
    Yields left and right contexts for each occurrence of a target word.

    :param words: A list of Stanza Word objects.
    :type words: list[Word]
    :param target_word: The text of the word to find within the list.
    :type target_word: str
    :param window_size: A tuple representing the number of words to include before and after.
    :type window_size: tuple[int, int]
    :param allowed_pos: Optional set of POS tags to filter context words, defaults to None.
    :type allowed_pos: set[str] | None

    :yield: A generator yielding tuples of left and right context lists.
    :rtype: ContextGenerator
    """

    for index, word in enumerate(words):
        left_pos_count: int = 0
        right_pos_count: int = 0

        left_context: list[Word] = []
        right_context: list[Word] = []

        if word.text == target_word:
            if allowed_pos:
                for left_index in range(index - 1, -1, -1):
                    if left_pos_count == window_size[0]:
                        left_context = sorted(left_context, key=lambda word: word.id)
                        break

                    left_context.append(words[left_index])
                    if words[left_index].upos in allowed_pos:
                        left_pos_count += 1

                for right_index in range(index + 1, len(words)):
                    if right_pos_count == window_size[1]:
                        break

                    right_context.append(words[right_index])
                    if words[right_index].upos in allowed_pos:
                        right_pos_count += 1
            else:
                start: int = max(0, index - window_size[0])
                end: int = min(len(words), index + window_size[1] + 1)

                left_context = words[start:index]
                right_context = words[index + 1 : end]

            yield (left_context, right_context)


def extract_collocates(
    words: list[Word],
    target_word: str,
    window_size: tuple[int, int],
    allowed_pos: set[str] | None = None,
) -> list[Collocates]:
    """
    Extracts context words around a target word as flat tuples.

    One tuple is produced per occurrence of ``target_word``: the left context
    followed by the right context, as yielded by ``_find_contexts``.

    :param words: A list of Stanza Word objects.
    :type words: list[Word]
    :param target_word: The word to find within the list.
    :type target_word: str
    :param window_size: A tuple representing the left and right window sizes.
    :type window_size: tuple[int, int]
    :param allowed_pos: Optional set of POS tags to filter context words, defaults to None.
    :type allowed_pos: set[str] | None

    :return: A list of word tuples matching the window constraints.
    :rtype: list[Collocates]
    """
    collocates: list[Collocates] = []
    for before, after in _find_contexts(
        words, target_word, window_size, allowed_pos
    ):
        # Flatten both sides into a single tuple, left side first.
        collocates.append((*before, *after))
    return collocates
def extract_directed_bigrams(
    words: list[Word],
    target_word: str,
    window_size: tuple[int, int],
    allowed_pos: set[str] | None = None,
) -> list[DirectedCollocates]:
    """
    Extracts directed bigrams (tagged with context direction) around a target word.

    Collects bigrams in the form:

    * Left bigrams: ``(context_word, ("L", target_word))``
    * Right bigrams: ``(context_word, ("R", target_word))``

    For each occurrence of the target, all left bigrams are emitted before the
    right ones.

    :param words: A list of Stanza Word objects.
    :type words: list[Word]
    :param target_word: The word to search for.
    :type target_word: str
    :param window_size: A tuple representing the left and right window sizes.
    :type window_size: tuple[int, int]
    :param allowed_pos: Optional set of POS tags to filter context words, defaults to None.
    :type allowed_pos: set[str] | None

    :return: A list of directed collocate tuples.
    :rtype: list[DirectedCollocates]
    """
    directed: list[DirectedCollocates] = []
    for before, after in _find_contexts(
        words, target_word, window_size, allowed_pos
    ):
        # Tag each side with its direction marker; left side is handled first.
        for direction, context in (("L", before), ("R", after)):
            for collocate in context:
                directed.append((collocate, (direction, target_word)))
    return directed
def extract_undirected_bigrams(
    words: list[Word],
    target_word: str,
    window_size: tuple[int, int],
    allowed_pos: set[str] | None = None,
) -> list[Bigram]:
    """
    Extracts undirected Bigram namedtuples surrounding a target word.

    Every word in the left and right context of each occurrence of the target
    becomes one ``Bigram(collocate=..., target_word=...)``; no direction is
    recorded.

    :param words: A list of Stanza Word objects.
    :type words: list[Word]
    :param target_word: The word to search for.
    :type target_word: str
    :param window_size: A tuple representing the left and right window sizes.
    :type window_size: tuple[int, int]
    :param allowed_pos: Optional set of POS tags to filter context words, defaults to None.
    :type allowed_pos: set[str] | None

    :return: A list of Bigram objects containing the context word and the target word.
    :rtype: list[Bigram]
    """
    contexts = _find_contexts(words, target_word, window_size, allowed_pos)
    return [
        Bigram(collocate=neighbour, target_word=target_word)
        for before, after in contexts
        for neighbour in before + after
    ]
def extract_ngrams(
    words: Sequence[LexicalItem], window_size: int, **kwargs
) -> list[Collocates]:
    """
    Splits lexical items into groups of sequential n-grams.

    :param words: A sequence of Stanza Word or Token objects.
    :type words: Sequence[LexicalItem]
    :param window_size: The size of the n-gram window.
    :type window_size: int
    :param kwargs: Additional keyword arguments passed to ``nltk.util.ngrams``
        (e.g., ``pad_left``, ``pad_right``). For more information check:
        https://www.nltk.org/api/nltk.util.html#nltk.util.ngrams

    :return: A list of word tuples representing the consecutive n-grams.
    :rtype: list[Collocates]
    """
    # ``ngrams`` is lazy; materialise it before narrowing the static type so the
    # cast describes the object that is actually returned.
    materialised = list(ngrams(words, window_size, **kwargs))
    return cast(list[Collocates], materialised)