# Source code for dstk.parameters.context.extraction.linguistic.word.window

"""
This module provides tools for extracting context based on "Linguistic Units" by
identifying "Window-based collocates." Within the framework of distributional semantics,
this module focuses on Lexeme contexts—extracting words that appear in close
proximity to a target word within a defined window.

Core functionalities include:

* Extracting collocates from specified left and right windows around a target word.
* Filtering context windows based on part-of-speech (POS) tags to refine linguistic data.
* Generating directed bigrams, which specify the directional relationship between
  a context word and a target word (e.g., Left or Right).
* Extracting undirected bigrams where only physical proximity is considered.
* Generating n-grams from sequences of lexical items for fixed-length
  proximity analysis.

The module is specifically tailored for Stanza Word objects and sequences of Lexical Items.
"""

from nltk.util import ngrams

from typing import Sequence, cast
from ......lib_types import (
    Collocates,
    DirectedCollocates,
    Bigram,
    Word,
    ContextGenerator,
    LexicalItem,
)


def _collect_pos_window(
    words: list[Word],
    indices: range,
    limit: int,
    allowed_pos: set[str],
) -> list[Word]:
    """
    Collects words along ``indices`` until ``limit`` allowed-POS words were seen.

    Every visited word is collected, whatever its tag; only words whose ``upos``
    is in ``allowed_pos`` count towards ``limit``. The scan stops right after the
    word that completes the count, or when ``indices`` is exhausted.

    :param words: The full list of words being scanned.
    :type words: list[Word]
    :param indices: Positions in ``words`` to visit, in visiting order.
    :type indices: range
    :param limit: Number of allowed-POS words that closes the window.
    :type limit: int
    :param allowed_pos: POS tags that count towards ``limit``.
    :type allowed_pos: set[str]

    :return: The visited words, in visiting order.
    :rtype: list[Word]
    """

    collected: list[Word] = []
    matched = 0

    for position in indices:
        # ``>=`` rather than ``==``: a zero or negative limit gives an empty
        # window, matching what the plain slicing branch of the caller yields.
        if matched >= limit:
            break

        candidate = words[position]
        collected.append(candidate)
        if candidate.upos in allowed_pos:
            matched += 1

    return collected


def _find_contexts(
    words: list[Word],
    target_word: str,
    window_size: tuple[int, int],
    allowed_pos: set[str] | None = None,
) -> ContextGenerator:
    """
    Yields left and right contexts for each occurrence of a target word.

    Without ``allowed_pos`` (``None`` or an empty set) the window is a plain slice:
    up to ``window_size[0]`` words before and ``window_size[1]`` words after the
    target, clipped at the boundaries of ``words``.

    With ``allowed_pos`` the window size is counted in words whose ``upos`` is in
    ``allowed_pos``: each side extends outwards from the target until that many
    allowed-POS words have been included. Words with other tags that lie inside
    that span are kept in the context; they are not filtered out.

    Both contexts are always returned in the order the words appear in ``words``.

    :param words: A list of Stanza Word objects (``text`` and ``upos`` are read).
    :type words: list[Word]
    :param target_word: The text of the word to find within the list.
    :type target_word: str
    :param window_size: A tuple representing the number of words to include before and after.
    :type window_size: tuple[int, int]
    :param allowed_pos: Optional set of POS tags used to measure the window, defaults to None.
    :type allowed_pos: set[str] | None

    :yield: One ``(left_context, right_context)`` tuple per occurrence of the target word.
    :rtype: ContextGenerator
    """

    for index, word in enumerate(words):
        if word.text != target_word:
            continue

        if allowed_pos:
            left_context: list[Word] = _collect_pos_window(
                words, range(index - 1, -1, -1), window_size[0], allowed_pos
            )
            # The left side is walked away from the target, so it is collected
            # back to front. Reverse it unconditionally: ordering must not depend
            # on how the scan ended, nor on sentence-local word ids.
            left_context.reverse()

            right_context: list[Word] = _collect_pos_window(
                words, range(index + 1, len(words)), window_size[1], allowed_pos
            )
        else:
            start: int = max(0, index - window_size[0])
            end: int = min(len(words), index + window_size[1] + 1)

            left_context = words[start:index]
            right_context = words[index + 1 : end]

        yield (left_context, right_context)



# [docs]
def extract_collocates(
    words: list[Word],
    target_word: str,
    window_size: tuple[int, int],
    allowed_pos: set[str] | None = None,
) -> list[Collocates]:
    """
    Collects, for every occurrence of a target word, its surrounding words as one tuple.

    Each tuple holds the left context followed by the right context produced by
    ``_find_contexts``; the target word itself is not part of the tuple.

    :param words: A list of Stanza Word objects.
    :type words: list[Word]
    :param target_word: The word to find within the list.
    :type target_word: str
    :param window_size: A tuple representing the left and right window sizes.
    :type window_size: tuple[int, int]
    :param allowed_pos: Optional set of POS tags forwarded to ``_find_contexts``, defaults to None.
    :type allowed_pos: set[str] | None

    :return: One tuple of context words per occurrence of the target word.
    :rtype: list[Collocates]
    """

    collocates: list[Collocates] = []

    contexts = _find_contexts(words, target_word, window_size, allowed_pos)
    for before, after in contexts:
        # Flatten both sides into a single tuple, left side first.
        collocates.append((*before, *after))

    return collocates




# [docs]
def extract_directed_bigrams(
    words: list[Word],
    target_word: str,
    window_size: tuple[int, int],
    allowed_pos: set[str] | None = None,
) -> list[DirectedCollocates]:
    """
    Pairs every context word with the target word and the side it was found on.

    Each entry has the form ``(context_word, (direction, target_word))`` where
    ``direction`` is:

    * ``"L"`` for words taken from the left context.
    * ``"R"`` for words taken from the right context.

    For each occurrence of the target word, the left entries come before the
    right entries.

    :param words: A list of Stanza Word objects.
    :type words: list[Word]
    :param target_word: The word to search for.
    :type target_word: str
    :param window_size: A tuple representing the left and right window sizes.
    :type window_size: tuple[int, int]
    :param allowed_pos: Optional set of POS tags forwarded to ``_find_contexts``, defaults to None.
    :type allowed_pos: set[str] | None

    :return: A list of directed collocate tuples.
    :rtype: list[DirectedCollocates]
    """

    directed: list[DirectedCollocates] = []

    contexts = _find_contexts(words, target_word, window_size, allowed_pos)
    for before, after in contexts:
        # Left side first, then right side, each tagged with its direction.
        for direction, side in (("L", before), ("R", after)):
            for neighbour in side:
                directed.append((neighbour, (direction, target_word)))

    return directed




# [docs]
def extract_undirected_bigrams(
    words: list[Word],
    target_word: str,
    window_size: tuple[int, int],
    allowed_pos: set[str] | None = None,
) -> list[Bigram]:
    """
    Builds one ``Bigram`` per context word around each occurrence of a target word.

    The side on which a context word was found is not recorded. For each
    occurrence, words of the left context come before words of the right context.

    :param words: A list of Stanza Word objects.
    :type words: list[Word]
    :param target_word: The word to search for.
    :type target_word: str
    :param window_size: A tuple representing the left and right window sizes.
    :type window_size: tuple[int, int]
    :param allowed_pos: Optional set of POS tags forwarded to ``_find_contexts``, defaults to None.
    :type allowed_pos: set[str] | None

    :return: A list of Bigram objects pairing each context word with the target word.
    :rtype: list[Bigram]
    """

    pairs: list[Bigram] = []

    contexts = _find_contexts(words, target_word, window_size, allowed_pos)
    for before, after in contexts:
        for side in (before, after):
            for neighbour in side:
                pairs.append(Bigram(collocate=neighbour, target_word=target_word))

    return pairs




# [docs]
def extract_ngrams(
    words: Sequence[LexicalItem], window_size: int, **kwargs
) -> list[Collocates]:
    """
    Returns the n-grams that ``nltk.util.ngrams`` produces for a sequence of lexical items.

    :param words: A sequence of Stanza Word or Token objects.
    :type words: Sequence[LexicalItem]
    :param window_size: The size of the n-gram window, passed as ``n`` to ``nltk.util.ngrams``.
    :type window_size: int
    :param kwargs: Additional keyword arguments passed to ``nltk.util.ngrams``
        (e.g., ``pad_left``, ``pad_right``). For more information check:
        https://www.nltk.org/api/nltk.util.html#nltk.util.ngrams

    :return: A list of word tuples representing the consecutive n-grams.
    :rtype: list[Collocates]
    """

    ngram_stream = ngrams(words, window_size, **kwargs)

    # Materialise first so the cast describes the value actually returned.
    return cast(list[Collocates], list(ngram_stream))