Skip to content

Simple chunking

SimpleChunker

Bases: BaseChunker

A simple chunking strategy that converts every TextChunk in the parsed document into a chunk.

Source code in supermat/core/chunking/simple_chunking.py
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
class SimpleChunker(BaseChunker):
    """
    Chunking strategy that maps each text section of a parsed document to exactly one chunk.

    Only sections that are instances of ``BaseTextChunk`` are converted; every other section is skipped.
    """

    @staticmethod
    def build_chunk(doc_id: int, section: ChunkModelType) -> ChunkDocument:
        """
        Wrap a single text section in a ``ChunkDocument``.

        Args:
            doc_id: Value stored as the resulting chunk's ``document_id``.
            section: Section to convert. It is expected to be a ``BaseTextChunk``
                whose ``properties`` attribute is truthy.

        Returns:
            A ``ChunkDocument`` holding the section's text plus metadata copied from
            the section (document, type, structure, page number and source path).
            The section itself is attached as ``chunk_meta``.

        Raises:
            AssertionError: When assertions are enabled and ``section`` is not a
                ``BaseTextChunk`` or its ``properties`` is falsy.
        """
        assert isinstance(section, BaseTextChunk)
        props = section.properties
        assert props

        # Metadata is built separately so the final constructor call stays short.
        chunk_metadata = BaseChunkMetadata(
            document=section.document,
            type=section.type_,
            structure=section.structure,
            page_number=props.page,
            source=props.path,
            chunk_meta=section,
        )
        return ChunkDocument(document_id=doc_id, text=section.text, metadata=chunk_metadata)

    def create_chunks(self, processed_document: ParsedDocumentType) -> DocumentChunksType:
        """
        Convert every ``BaseTextChunk`` section of ``processed_document`` into a chunk.

        Args:
            processed_document: Iterable of parsed sections.

        Returns:
            A list of ``ChunkDocument`` objects, one per text section, in document order.
            Each chunk's ``document_id`` is the section's position in ``processed_document``;
            skipped non-text sections still consume a position, so ids may have gaps.
        """
        chunks = []
        for position, section in enumerate(processed_document):
            if not isinstance(section, BaseTextChunk):
                continue
            chunks.append(SimpleChunker.build_chunk(position, section))
        return chunks