Utils

extract_meaningful_words(text)

For a given text, extract a set of relevant keywords using nltk.

Source code in supermat/core/parser/utils.py
def extract_meaningful_words(text: str) -> set[str]:
    """For given text, extract set of relevant keywords using nltk."""
    # Tokenize the sentence
    tokens = nltk_word_tokenize(text)
    # Perform POS tagging
    tagged_tokens = nltk_pos_tag(tokens)
    # Keep nouns, verbs, adjectives, and adverbs longer than 4 characters,
    # plus all numerics (CD tags), excluding the pronoun "I"
    keywords = [
        word
        for word, tag in tagged_tokens
        if ((tag.startswith(("NN", "VB", "JJ", "RB")) and len(word) > 4) or (tag == "CD")) and word.lower() != "i"
    ]
    return set(keywords)
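
A minimal usage sketch, assuming the nltk data used by the tokenizer and tagger ("punkt" and "averaged_perceptron_tagger") has been downloaded, and that nltk_word_tokenize and nltk_pos_tag are module-level aliases of nltk.word_tokenize and nltk.pos_tag:

import nltk
from nltk import pos_tag as nltk_pos_tag, word_tokenize as nltk_word_tokenize

# One-time downloads for the tokenizer and POS tagger data.
nltk.download("punkt")
nltk.download("averaged_perceptron_tagger")

words = extract_meaningful_words("The parser processed 128 annotated documents in 2023.")
# Long nouns/verbs/adjectives/adverbs plus numerics (CD) survive,
# e.g. {"parser", "processed", "annotated", "documents", "128", "2023"}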

get_keywords(text)

For a given text, retrieve a relevant list of keywords using spacy and nltk.

Source code in supermat/core/parser/utils.py
def get_keywords(text: str) -> list[str]:
    """For given text, retrieve relevant list of keywords using spacy and nltk."""
    return list(extract_spacy_keywords(text) | extract_meaningful_words(text))
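
extract_spacy_keywords is referenced above but defined elsewhere in utils.py. For orientation only, a hypothetical sketch of the shape such a helper could take, assuming spacy with the en_core_web_sm model installed; the actual implementation may differ:

import spacy

_nlp = spacy.load("en_core_web_sm")

def extract_spacy_keywords(text: str) -> set[str]:  # hypothetical sketch, not the project source
    # Treat named-entity spans as candidate keywords.
    return {ent.text for ent in _nlp(text).ents}

keywords = get_keywords("Supermat converts PDF reports into structured chunks.")
# Union of the spacy-derived set and extract_meaningful_words(text), returned as a list.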

split_text_into_token_chunks(text, max_tokens=8000, model_name=TOKENIZER_MODEL_NAME)

Splits a text into chunks based on token count using LangChain's token splitter.

Parameters:

    text (str): The text to be split. Required.
    max_tokens (int): The maximum number of tokens in each chunk. Default: 8000.
    model_name (str): The LLM model name used to determine tokenization rules. Default: TOKENIZER_MODEL_NAME.

Returns:

    list[str]: A list of text chunks, each containing at most max_tokens tokens.

Source code in supermat/core/parser/utils.py
def split_text_into_token_chunks(text: str, max_tokens: int = 8000, model_name: str = TOKENIZER_MODEL_NAME) -> list[str]:
    """
    Splits a text into chunks based on token count using LangChain's token splitter.

    Args:
        text (str): The text to be split.
        max_tokens (int): The maximum number of tokens in each chunk.
        model_name (str): The LLM model name to determine tokenization rules.

    Returns:
        list: A list of text chunks, each with up to max_tokens tokens.
    """
    # Pick the tiktoken encoding that matches the given model name.
    encoding = tiktoken.encoding_for_model(model_name)
    # Split strictly by token count, with no overlap between consecutive chunks.
    splitter = TokenTextSplitter(encoding_name=encoding.name, chunk_size=max_tokens, chunk_overlap=0)
    chunks = splitter.split_text(text)
    return chunks
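
A short usage sketch, assuming utils.py imports tiktoken and LangChain's TokenTextSplitter (from langchain_text_splitters import TokenTextSplitter), and that TOKENIZER_MODEL_NAME names a model tiktoken recognizes (for example "gpt-4o-mini"):

import tiktoken

long_document = "example " * 5000  # roughly 5000 tokens of filler text
chunks = split_text_into_token_chunks(long_document, max_tokens=1000)

# Every chunk stays within the token budget under the same encoding.
encoding = tiktoken.encoding_for_model(TOKENIZER_MODEL_NAME)
assert all(len(encoding.encode(chunk)) <= 1000 for chunk in chunks)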