Source code for dstk.modules.text_processor
"""
This module provides utility functions for processing tokenized or lemmatized text represented as lists of strings
or POS-tagged tuples. It supports common text normalization and transformation tasks, such as lowercasing,
vocabulary extraction, and joining tokens into a single string. Additionally, it includes functionality for saving
processed text or tagged data to a file in plain text or CSV format.
Core functionalities include:

* Converting spaCy tokens to strings (with optional lemmatization)
* Lowercasing and vocabulary extraction
* Joining word lists into full text strings
* Saving word lists or (token, POS) pairs to disk in a consistent format

This module is useful for preparing text data for further analysis, modeling, or storage.
"""
from pathlib import Path
from ..lib_types.dstk_types import Words, POSTaggedWordList, Token, POSTaggedWord
[docs]
def tokens_to_text(tokens: Words[Token], lemmatize: bool = False) -> Words[str]:
    """
    Turns spaCy Token objects into plain strings.

    :param tokens: The spaCy tokens to convert.
    :type tokens: Words[Token]
    :param lemmatize: If True, each token contributes its lower-cased ``lemma_``; otherwise its ``text`` is used unchanged. Defaults to False.
    :type lemmatize: bool
    :return: One string per input token, in the same order.
    :rtype: Words[str]
    """
    if lemmatize:
        # Lemmas are lower-cased here; the surface forms below keep their casing.
        return [tok.lemma_.lower() for tok in tokens]
    return [tok.text for tok in tokens]
[docs]
def to_lower(words: Words[str]) -> Words[str]:
    """
    Lower-cases every word of a list.

    :param words: The words to convert, represented as strings.
    :type words: Words[str]
    :return: A new list holding the lower-cased form of each word, in the same order.
    :rtype: Words[str]
    """
    lowered = [item.lower() for item in words]
    return lowered
[docs]
def get_vocabulary(words: Words[str]) -> Words[str]:
    """
    Returns the vocabulary of a text: its distinct words in sorted order.

    :param words: The words of the text, represented as strings.
    :type words: Words[str]
    :return: A sorted list containing each distinct word exactly once.
    :rtype: Words[str]
    """
    # Going through a set drops the duplicates before sorting.
    unique_words = list(set(words))
    unique_words.sort()
    return unique_words
[docs]
def join(words: Words[str]) -> str:
    """
    Concatenates a list of words into one text string.

    :param words: The words to concatenate, represented as strings.
    :type words: Words[str]
    :return: The input words joined by single spaces.
    :rtype: str
    """
    separator = " "
    return separator.join(words)
[docs]
def save_to_file(words: Words[str] | POSTaggedWordList, path: str) -> str:
    """
    Saves a list of strings or (word, POS) tuples to the specified path.

    Plain strings are written one per line. POSTaggedWord tuples are written one per line as a
    ``word,POS`` CSV record; a field that contains a comma, a quote or a line break is quoted so
    the record can be parsed back unambiguously. Every entry is validated before the file is
    opened, so invalid input never truncates an existing file or leaves a partial one behind.
    The file is written with UTF-8 encoding.

    :param words: A list of words represented as strings or a list of POSTaggedWord tuples.
    :type words: Words[str] or POSTaggedWordList
    :param path: The path where to save the list of words.
    :type path: str
    :return: The resolved, absolute path where the file was saved.
    :rtype: str
    :raises ValueError: If an entry is neither a string nor a POSTaggedWord, or if the word of a
        POSTaggedWord is not a string.
    """
    # Local import keeps this function self-contained; csv is part of the standard library.
    import csv

    # First pass: validate and normalise without touching the file system.
    entries = []
    for word in words:
        if isinstance(word, str):
            entries.append(word)
        elif isinstance(word, POSTaggedWord):
            if not isinstance(word[0], str):
                raise ValueError("You can only use save_to_file with a POSTaggedWordList if word is of type of str.")
            entries.append((word[0], word[1]))
        else:
            raise ValueError("You can only use save_to_file with Words[str] | POSTaggedWordList")

    # Second pass: write. An explicit encoding avoids locale-dependent output and
    # UnicodeEncodeError on platforms whose default encoding is not UTF-8.
    with open(path, "w", encoding="utf-8") as file:
        # lineterminator="\n" keeps the same line endings as the plain-string branch.
        writer = csv.writer(file, lineterminator="\n")
        for entry in entries:
            if isinstance(entry, str):
                file.write(entry + "\n")
            else:
                # csv.writer quotes fields containing "," or '"' (e.g. the "," token itself).
                writer.writerow(entry)

    return str(Path(path).resolve())