Source code for dstk.parameters.context.extraction.linguistic.word.window
"""
This module provides tools for extracting context based on "Linguistic Units" by
identifying "Window-based collocates." Within the framework of distributional semantics,
this module focuses on Lexeme contexts—extracting words that appear in close
proximity to a target word within a defined window.
Core functionalities include:
* Extracting collocates from specified left and right windows around a target word.
* Filtering context windows based on part-of-speech (POS) tags to refine linguistic data.
* Generating directed bigrams, which specify the directional relationship between
a context word and a target word (e.g., Left or Right).
* Extracting undirected bigrams where only physical proximity is considered.
* Generating n-grams from sequences of lexical items for fixed-length
proximity analysis.
The module is specifically tailored for Stanza Word objects and sequences of Lexical Items.
"""
from nltk.util import ngrams
from typing import Sequence, cast
from ......lib_types import (
Collocates,
DirectedCollocates,
Bigram,
Word,
ContextGenerator,
LexicalItem,
)
def _take_pos_window(
    candidates: Sequence[Word],
    quota: int,
    allowed_pos: set[str],
) -> list[Word]:
    """Collects words from ``candidates`` until ``quota`` allowed-POS words were seen."""
    taken: list[Word] = []
    matched: int = 0
    for candidate in candidates:
        # Checked before appending, so a quota of 0 yields an empty window.
        if matched == quota:
            break
        # Every word is kept; only words with an allowed POS count towards the quota.
        taken.append(candidate)
        if candidate.upos in allowed_pos:
            matched += 1
    return taken


def _find_contexts(
    words: list[Word],
    target_word: str,
    window_size: tuple[int, int],
    allowed_pos: set[str] | None = None,
) -> ContextGenerator:
    """
    Yields left and right contexts for each occurrence of a target word.

    Without ``allowed_pos`` (``None`` or an empty set) the window is a plain
    slice: up to ``window_size[0]`` words before and ``window_size[1]`` words
    after the target, clipped at the boundaries of ``words``.

    With ``allowed_pos`` the window sizes count only words whose ``upos`` is in
    ``allowed_pos``. Each side grows outwards from the target until that many
    matching words were collected or the end of ``words`` is reached; words with
    other POS tags that lie in between are kept in the context.

    Both contexts are always returned in the order in which the words appear in
    ``words``.

    :param words: A list of Stanza Word objects.
    :type words: list[Word]
    :param target_word: The text of the word to find within the list.
    :type target_word: str
    :param window_size: A tuple representing the number of words to include before and after.
    :type window_size: tuple[int, int]
    :param allowed_pos: Optional set of POS tags that count towards the window size, defaults to None.
    :type allowed_pos: set[str] | None
    :yield: A generator yielding tuples of left and right context lists.
    :rtype: ContextGenerator
    """
    left_size, right_size = window_size
    for index, word in enumerate(words):
        if word.text != target_word:
            continue
        if allowed_pos:
            # Scan leftwards starting with the word closest to the target.
            left_context: list[Word] = _take_pos_window(
                words[:index][::-1], left_size, allowed_pos
            )
            # The scan collects nearest-first; restore positional order. This is
            # done unconditionally (also when the start of ``words`` was reached
            # before the quota) and by position, so it does not depend on
            # ``Word.id`` being unique across the whole input.
            left_context.reverse()
            right_context: list[Word] = _take_pos_window(
                words[index + 1 :], right_size, allowed_pos
            )
        else:
            start: int = max(0, index - left_size)
            end: int = min(len(words), index + right_size + 1)
            left_context = words[start:index]
            right_context = words[index + 1 : end]
        yield (left_context, right_context)
[docs]
def extract_collocates(
    words: list[Word],
    target_word: str,
    window_size: tuple[int, int],
    allowed_pos: set[str] | None = None,
) -> list[Collocates]:
    """
    Collects, for every occurrence of the target word, its surrounding words as one flat tuple.

    The left context comes first in each tuple, followed by the right context;
    the target word itself is not part of the tuple.

    :param words: A list of Stanza Word objects.
    :type words: list[Word]
    :param target_word: The word to find within the list.
    :type target_word: str
    :param window_size: A tuple representing the left and right window sizes.
    :type window_size: tuple[int, int]
    :param allowed_pos: Optional set of POS tags forwarded to the context search, defaults to None.
    :type allowed_pos: set[str] | None
    :return: One tuple of context words per occurrence of the target word.
    :rtype: list[Collocates]
    """
    collocates: list[Collocates] = []
    contexts = _find_contexts(words, target_word, window_size, allowed_pos)
    for before, after in contexts:
        collocates.append((*before, *after))
    return collocates
[docs]
def extract_directed_bigrams(
    words: list[Word],
    target_word: str,
    window_size: tuple[int, int],
    allowed_pos: set[str] | None = None,
) -> list[DirectedCollocates]:
    """
    Pairs every context word with the target word and the side it was found on.

    Each entry has one of the following forms:

    * Left bigrams: ``(context_word, ("L", target_word))``
    * Right bigrams: ``(context_word, ("R", target_word))``

    For each occurrence of the target word, the left bigrams are emitted before
    the right bigrams.

    :param words: A list of Stanza Word objects.
    :type words: list[Word]
    :param target_word: The word to search for.
    :type target_word: str
    :param window_size: A tuple representing the left and right window sizes.
    :type window_size: tuple[int, int]
    :param allowed_pos: Optional set of POS tags forwarded to the context search, defaults to None.
    :type allowed_pos: set[str] | None
    :return: A list of directed collocate tuples.
    :rtype: list[DirectedCollocates]
    """
    contexts = _find_contexts(words, target_word, window_size, allowed_pos)
    return [
        (collocate, (side, target_word))
        for before, after in contexts
        for side, group in (("L", before), ("R", after))
        for collocate in group
    ]
[docs]
def extract_undirected_bigrams(
    words: list[Word],
    target_word: str,
    window_size: tuple[int, int],
    allowed_pos: set[str] | None = None,
) -> list[Bigram]:
    """
    Builds one Bigram per context word found around each occurrence of the target word.

    No direction is recorded; for each occurrence the left-context words are
    emitted first, followed by the right-context words.

    :param words: A list of Stanza Word objects.
    :type words: list[Word]
    :param target_word: The word to search for.
    :type target_word: str
    :param window_size: A tuple representing the left and right window sizes.
    :type window_size: tuple[int, int]
    :param allowed_pos: Optional set of POS tags forwarded to the context search, defaults to None.
    :type allowed_pos: set[str] | None
    :return: A list of Bigram objects containing the context word and the target word.
    :rtype: list[Bigram]
    """
    contexts = _find_contexts(words, target_word, window_size, allowed_pos)
    return [
        Bigram(collocate=collocate, target_word=target_word)
        for before, after in contexts
        for collocate in (*before, *after)
    ]
[docs]
def extract_ngrams(
    words: Sequence[LexicalItem], window_size: int, **kwargs
) -> list[Collocates]:
    """
    Splits lexical items into groups of sequential n-grams.

    :param words: A sequence of Stanza Word or Token objects.
    :type words: Sequence[LexicalItem]
    :param window_size: The size of the n-gram window.
    :type window_size: int
    :param kwargs: Additional keyword arguments passed to ``nltk.util.ngrams`` (e.g., ``pad_left``, ``pad_right``).
        For more information check: https://www.nltk.org/api/nltk.util.html#nltk.util.ngrams
    :return: A list of word tuples representing the consecutive n-grams.
    :rtype: list[Collocates]
    """
    # ``ngrams`` produces its tuples lazily, so materialise the result first;
    # the cast then describes the object that is actually returned instead of
    # labelling an iterator as a list.
    return cast(list[Collocates], list(ngrams(words, window_size, **kwargs)))