Utils

extract_meaningful_words(text)

For a given text, extract a set of relevant keywords using nltk.

Source code in supermat/core/parser/utils.py
def extract_meaningful_words(text: str) -> set[str]:
    """For given text, extract set of relevant keywords using nltk."""
    # Tokenize the sentence
    tokens = nltk_word_tokenize(text)
    # Perform POS tagging
    tagged_tokens = nltk_pos_tag(tokens)
    # Keep nouns, verbs, adjectives, and adverbs longer than 4 characters,
    # plus all numerics (CD tags), excluding the pronoun "I"
    keywords = [
        word
        for word, tag in tagged_tokens
        if ((tag.startswith(("NN", "VB", "JJ", "RB")) and len(word) > 4) or (tag == "CD")) and word.lower() != "i"
    ]
    return set(keywords)
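
A minimal usage sketch, assuming the nltk data used by the tokenizer and tagger ("punkt" and "averaged_perceptron_tagger") has been downloaded, and that nltk_word_tokenize and nltk_pos_tag are module-level aliases of nltk.word_tokenize and nltk.pos_tag:

import nltk
from nltk import pos_tag as nltk_pos_tag, word_tokenize as nltk_word_tokenize

# One-time downloads for the tokenizer and POS tagger data.
nltk.download("punkt")
nltk.download("averaged_perceptron_tagger")

words = extract_meaningful_words("The parser processed 128 annotated documents in 2023.")
# Long nouns/verbs/adjectives/adverbs plus numerics (CD) survive,
# e.g. {"parser", "processed", "annotated", "documents", "128", "2023"}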

get_keywords(text)

For a given text, retrieve a relevant list of keywords using spacy and nltk.

Source code in supermat/core/parser/utils.py
def get_keywords(text: str) -> list[str]:
    """For given text, retrieve relevant list of keywords using spacy and nltk."""
    return list(extract_spacy_keywords(text) | extract_meaningful_words(text))
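
extract_spacy_keywords is referenced above but defined elsewhere in utils.py. For orientation only, a hypothetical sketch of the shape such a helper could take, assuming spacy with the en_core_web_sm model installed; the actual implementation may differ:

import spacy

_nlp = spacy.load("en_core_web_sm")

def extract_spacy_keywords(text: str) -> set[str]:  # hypothetical sketch, not the project source
    # Treat named-entity spans as candidate keywords.
    return {ent.text for ent in _nlp(text).ents}

keywords = get_keywords("Supermat converts PDF reports into structured chunks.")
# Union of the spacy-derived set and extract_meaningful_words(text), returned as a list.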

split_text_into_token_chunks(text, max_tokens=8000, model_name=TOKENIZER_MODEL_NAME)

Splits a text into chunks based on token count using LangChain's token splitter.

Parameters:

    text (str): The text to be split. Required.
    max_tokens (int): The maximum number of tokens in each chunk. Default: 8000.
    model_name (str): The LLM model name used to determine tokenization rules. Default: TOKENIZER_MODEL_NAME.

Returns:

    list[str]: A list of text chunks, each containing at most max_tokens tokens.

Source code in supermat/core/parser/utils.py
def split_text_into_token_chunks(text: str, max_tokens: int = 8000, model_name: str = TOKENIZER_MODEL_NAME) -> list[str]:
    """
    Splits a text into chunks based on token count using LangChain's token splitter.

    Args:
        text (str): The text to be split.
        max_tokens (int): The maximum number of tokens in each chunk.
        model_name (str): The LLM model name to determine tokenization rules.

    Returns:
        list: A list of text chunks, each with up to max_tokens tokens.
    """
    # Pick the tiktoken encoding that matches the given model name.
    encoding = tiktoken.encoding_for_model(model_name)
    # Split strictly by token count, with no overlap between consecutive chunks.
    splitter = TokenTextSplitter(encoding_name=encoding.name, chunk_size=max_tokens, chunk_overlap=0)
    chunks = splitter.split_text(text)
    return chunks
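
A short usage sketch, assuming utils.py imports tiktoken and LangChain's TokenTextSplitter (from langchain_text_splitters import TokenTextSplitter), and that TOKENIZER_MODEL_NAME names a model tiktoken recognizes (for example "gpt-4o-mini"):

import tiktoken

long_document = "example " * 5000  # roughly 5000 tokens of filler text
chunks = split_text_into_token_chunks(long_document, max_tokens=1000)

# Every chunk stays within the token budget under the same encoding.
encoding = tiktoken.encoding_for_model(TOKENIZER_MODEL_NAME)
assert all(len(encoding.encode(chunk)) <= 1000 for chunk in chunks)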