Bases: BaseChunker
A simple chunking strategy that converts every TextChunk in the parsed document into a chunk.
Source code in supermat/core/chunking/simple_chunking.py
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
class SimpleChunker(BaseChunker):
    """Chunker that turns each text chunk of a parsed document into one output chunk.

    Sections that are not ``BaseTextChunk`` instances are skipped.
    """

    @staticmethod
    def build_chunk(doc_id: int, section: ChunkModelType) -> ChunkDocument:
        """Wrap a single text section in a ``ChunkDocument``.

        Args:
            doc_id: Value stored as the resulting chunk's ``document_id``.
            section: The parsed section to convert. It must be a
                ``BaseTextChunk`` whose ``properties`` attribute is truthy.

        Returns:
            A ``ChunkDocument`` holding the section's text, with metadata
            copied from the section and its properties.

        Raises:
            AssertionError: If ``section`` is not a ``BaseTextChunk`` or its
                ``properties`` is falsy (only when assertions are enabled).
        """
        assert isinstance(section, BaseTextChunk)
        assert section.properties

        chunk_text = section.text
        chunk_metadata = BaseChunkMetadata(
            document=section.document,
            type=section.type_,
            structure=section.structure,
            page_number=section.properties.page,
            source=section.properties.path,
            # The original section is retained in full alongside the copied fields.
            chunk_meta=section,
        )
        return ChunkDocument(document_id=doc_id, text=chunk_text, metadata=chunk_metadata)

    def create_chunks(self, processed_document: ParsedDocumentType) -> DocumentChunksType:
        """Convert every text section of ``processed_document`` into a chunk.

        Args:
            processed_document: Iterable of parsed sections.

        Returns:
            A list with one ``ChunkDocument`` per ``BaseTextChunk`` section, in
            document order. Each chunk's ``document_id`` is the section's
            position in ``processed_document``, counted over all sections
            (including skipped ones), so the ids may have gaps.
        """
        chunks = []
        for position, section in enumerate(processed_document):
            if not isinstance(section, BaseTextChunk):
                continue
            chunks.append(SimpleChunker.build_chunk(position, section))
        return chunks
|