Skip to content

Parsed document

BaseChunk

Bases: CustomBaseModel

BaseChunk that contains the type and structure of the chunk.

Source code in supermat/core/models/parsed_document.py
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
class BaseChunk(CustomBaseModel):
    """Base model shared by every chunk: its type tag, its structure string and its document."""

    # Chunk kind; populated from the "type" key (alias) and frozen.
    type_: Literal["Text", "Image", "Footnote"] = Field(alias="type", frozen=True)
    # Structure string compared by the subsection helpers below.
    structure: str
    # Document string; empty by default.
    document: str = ""

    @overload
    def has_subsection(self, sub_section: BaseChunk) -> bool: ...  # noqa: U100, E704

    @overload
    def has_subsection(self, sub_section: str) -> bool: ...  # noqa: U100, E704

    def has_subsection(self, sub_section: BaseChunk | str) -> bool:
        """Check ``sub_section`` against this chunk using the module-level ``is_subsection`` helper.

        Args:
            sub_section (BaseChunk | str): A structure string, or a chunk whose ``structure`` is used.

        Returns:
            bool: Result of ``is_subsection(<sub_section structure>, self.structure)``.
        """
        if isinstance(sub_section, str):
            candidate_structure = sub_section
        else:
            candidate_structure = sub_section.structure
        # The bare name resolves to the module-level function, not to the method of the same name.
        return is_subsection(candidate_structure, self.structure)

    @overload
    def is_subsection(self, section: BaseChunk) -> bool: ...  # noqa: U100, E704

    @overload
    def is_subsection(self, section: str) -> bool: ...  # noqa: U100, E704

    def is_subsection(self, section: BaseChunk | str) -> bool:
        """Check this chunk against ``section`` using the module-level ``is_subsection`` helper.

        Args:
            section (BaseChunk | str): A structure string, or a chunk whose ``structure`` is used.

        Returns:
            bool: Result of ``is_subsection(self.structure, <section structure>)``.
        """
        if isinstance(section, str):
            parent_structure = section
        else:
            parent_structure = section.structure
        # The bare name resolves to the module-level function, not to this method.
        return is_subsection(self.structure, parent_structure)

BaseChunkProperty

Bases: CustomBaseModel

Properties associated with a chunk. Close to Adobe's format.

Source code in supermat/core/models/parsed_document.py
105
106
107
108
109
110
111
112
class BaseChunkProperty(CustomBaseModel):
    """Properties associated with a chunk. Close to Adobe's format."""

    # Optional identifier; read from either the "ObjectID" or the "ObjectId" input key.
    object_id: int | None = Field(default=None, validation_alias=AliasChoices("ObjectID", "ObjectId"))
    # Required 4-tuple of numbers read from "Bounds".
    # NOTE(review): presumably a bounding box; the coordinate order is not visible here, so confirm.
    bounds: tuple[float | int, float | int, float | int, float | int] = Field(validation_alias="Bounds")
    # Required integer read from "Page".
    page: int = Field(validation_alias="Page")
    # Optional string read from "Path".
    path: str | None = Field(default=None, validation_alias="Path")
    # Optional free-form mapping; no alias, so it is read under its own name.
    attributes: dict[str, Any] | None = None

BaseTextChunk

Bases: BaseChunk

Common TextChunk model.

Source code in supermat/core/models/parsed_document.py
171
172
173
174
175
176
177
class BaseTextChunk(BaseChunk):
    """Common TextChunk model."""

    # Required text content of the chunk.
    text: str
    # Required list of string keys.
    # NOTE(review): the meaning of these keys is not visible here, so confirm with the producer.
    key: list[str]
    # Optional chunk properties; None when not provided.
    properties: BaseChunkProperty | None = None
    # Optional sequence of nested chunks, typed through a forward reference declared elsewhere.
    # NOTE(review): presumably sentence-level chunks, so confirm.
    sentences: Sequence[ChunkModelForwardRefType] | None = None

CustomBaseModel

Bases: BaseModel

BaseModel with some extra tweaks. It is needed to handle previously saved output of parsed documents, which has optional keys and has to be preserved for tests.

Source code in supermat/core/models/parsed_document.py
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
class CustomBaseModel(BaseModel):
    """
    BaseModel with some extra tweaks.

    Needed to handle previously saved output of parsed documents, which has optional keys and has to be
    preserved for tests. Each instance remembers which alias every field was supplied under and which keys
    were missing from the input, so that serialization can reproduce the original key set.
    """

    model_config = ConfigDict(populate_by_name=True, json_schema_extra={"by_alias": True}, extra="forbid")
    # field name -> key name to emit for that field on serialization.
    # Default factories keep instances that were built without ``__init__`` (e.g. ``model_construct``)
    # serializable instead of raising ``AttributeError`` in ``serialize_model``.
    _original_alias: dict[str, str] = PrivateAttr(default_factory=dict)
    # Keys (alias if one is known, else field name) that were absent from the constructor input.
    _unexisted_keys: set[str] = PrivateAttr(default_factory=set)

    def __init__(self, **data: Any):
        """Record the aliases used and the keys missing in ``data``, then run normal validation.

        Args:
            **data: Raw field values keyed by field name or alias.
        """
        aliases: dict[str, str] = {}
        unexisted_keys: set[str] = set()
        # Read the field definitions from the class: instance access to ``model_fields`` is deprecated
        # in recent Pydantic releases.
        for field_name, field in type(self).model_fields.items():
            alias_found = False
            if isinstance(field.validation_alias, AliasChoices):
                # Several spellings are accepted; remember the first one actually present in the input.
                for alias in field.validation_alias.choices:
                    if alias in data:
                        aliases[field_name] = alias
                        alias_found = True
                        break
            elif field.alias is not None or field.validation_alias is not None:
                # A single alias is recorded whether or not the input used it.
                alias = field.alias or field.validation_alias
                if TYPE_CHECKING:
                    assert isinstance(alias, str)
                aliases[field_name] = alias
                alias_found = True

            # The key counts as missing when neither its alias nor its field name appears in the input.
            if not ((alias_found and aliases[field_name] in data) or (field_name in data)):
                unexisted_keys.add(aliases[field_name] if alias_found else field_name)

        super().__init__(**data)
        # Private attributes can only be assigned once the base initializer has run.
        self._original_alias = aliases
        self._unexisted_keys = unexisted_keys

    @model_serializer(mode="wrap")
    def serialize_model(self, nxt: SerializerFunctionWrapHandler) -> dict[str, Any]:
        """Serialize with the recorded key names and without keys that were absent from the input.

        Args:
            nxt (SerializerFunctionWrapHandler): The default serializer being wrapped.

        Returns:
            dict[str, Any]: The default serialization with recorded fields renamed to their alias and
            with missing, non-frozen keys removed.
        """
        serialized = nxt(self)
        # Rename fields to their recorded alias; renamed entries end up after the untouched ones.
        aliased_values = {
            renamed_field_name: serialized.pop(field_name)
            for field_name, renamed_field_name in self._original_alias.items()
            if field_name in serialized
        }
        serialized.update(aliased_values)
        # Frozen fields are always emitted, even when the input did not contain them.
        frozen_keys = {
            field.alias or field_name for field_name, field in type(self).model_fields.items() if field.frozen
        }
        _unexisted_keys = self._unexisted_keys - frozen_keys
        cleaned_serialized = {
            field_name: value for field_name, value in serialized.items() if field_name not in _unexisted_keys
        }
        return cleaned_serialized

serialize_model(nxt)

This custom serializer restores the key names the model was built from (its recorded aliases) and omits non-frozen keys that were absent from the original input.

Source code in supermat/core/models/parsed_document.py
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
@model_serializer(mode="wrap")
def serialize_model(self, nxt: SerializerFunctionWrapHandler) -> dict[str, Any]:
    """Serialize with the recorded key names and without keys that were absent from the input.

    Args:
        nxt (SerializerFunctionWrapHandler): The default serializer being wrapped.

    Returns:
        dict[str, Any]: The default serialization with recorded fields renamed to their alias and
        with missing, non-frozen keys removed.
    """
    serialized = nxt(self)
    # Rename fields to their recorded alias; renamed entries end up after the untouched ones.
    aliased_values = {
        renamed_field_name: serialized.pop(field_name)
        for field_name, renamed_field_name in self._original_alias.items()
        if field_name in serialized
    }
    serialized.update(aliased_values)
    # Read the field definitions from the class: instance access to ``model_fields`` is deprecated
    # in recent Pydantic releases. Frozen fields are always emitted, even when missing from the input.
    frozen_keys = {
        field.alias or field_name for field_name, field in type(self).model_fields.items() if field.frozen
    }
    _unexisted_keys = self._unexisted_keys - frozen_keys
    cleaned_serialized = {
        field_name: value for field_name, value in serialized.items() if field_name not in _unexisted_keys
    }
    return cleaned_serialized

FontProperties

Bases: CustomBaseModel

Font properties in a TextChunkProperty.

Source code in supermat/core/models/parsed_document.py
115
116
117
118
119
120
121
122
123
124
125
126
127
128
class FontProperties(CustomBaseModel):
    """Font properties in a TextChunkProperty."""

    # Sets extra="allow" for this model, whereas CustomBaseModel configures extra="forbid".
    model_config = ConfigDict(extra="allow")
    # Every field below is optional and defaults to None, except ``name``, which is required.
    alt_family_name: str | None = None
    embedded: bool | None = None
    encoding: str | None = None
    family_name: str | None = None
    font_type: str | None = None
    italic: bool | None = None
    monospaced: bool | None = None
    # Required font name.
    name: str
    subset: bool | None = None
    weight: int | None = None

FootnoteChunk

Bases: TextChunk

TextChunk which is a Footnote

Source code in supermat/core/models/parsed_document.py
212
213
214
215
216
217
class FootnoteChunk(TextChunk):
    """TextChunk which is a Footnote."""

    # Narrows the inherited type tag to the single literal "Footnote": it defaults to that value,
    # is populated from the "type" alias and is frozen. The pyright ignore covers the narrowed override.
    type_: Literal["Footnote"] = Field(  # pyright: ignore[reportIncompatibleVariableOverride]
        default="Footnote", alias="type", frozen=True
    )

ImageChunk

Bases: BaseChunk, BaseChunkProperty

ImageChunk that stores the image in Base64 encoding.

Source code in supermat/core/models/parsed_document.py
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
class ImageChunk(BaseChunk, BaseChunkProperty):
    """Chunk for an image, carrying the image data as Base64 alongside the chunk properties."""

    # Type tag narrowed to "Image": default value, "type" alias, frozen.
    type_: Literal["Image"] = Field(  # pyright: ignore[reportIncompatibleVariableOverride]
        default="Image", alias="type", frozen=True
    )
    # Optional string; None by default.
    figure: str | None = None
    # Image payload read from the "figure-object" key; no default is declared, and it is hidden from repr.
    figure_object: Base64Bytes | None = Field(validation_alias="figure-object", repr=False)

    @field_validator("figure_object", mode="before")
    @classmethod
    def validate_data(cls, value: Base64Bytes | None, info: ValidationInfo):  # noqa: U100
        """Pass ``figure_object`` through unchanged, emitting a ValidationWarning when it is None."""
        # TODO (@legendof-selda): figure out a way to find the path where this fails.
        # NOTE: This shouldn't be allowed, but in the sample we have a case where the images aren't saved.
        if value is not None:
            return value
        warn(f"{info.field_name} is None.", ValidationWarning)
        return None

TextChunk

Bases: BaseTextChunk

TextChunk which was similar to the initial version of supermat.

Source code in supermat/core/models/parsed_document.py
180
181
182
183
184
185
186
187
188
189
class TextChunk(BaseTextChunk):
    """TextChunk which was similar to the initial version of supermat."""

    # Narrows the inherited type tag to the single literal "Text": default value, "type" alias, frozen.
    type_: Literal["Text"] = Field(  # pyright: ignore[reportIncompatibleVariableOverride]
        default="Text", alias="type", frozen=True
    )
    # Optional free-form mapping; None by default.
    speaker: dict[str, Any] | None = None
    # Optional string; None by default.
    # NOTE(review): the timestamp format is not visible here, so confirm with the producer.
    timestamp: str | None = None
    # Optional list of strings; None by default.
    annotations: list[str] | None = None
    # Overrides the inherited ``properties`` field with the text-specific property model.
    properties: TextChunkProperty | None = None  # pyright: ignore[reportIncompatibleVariableOverride]

TextChunkProperty

Bases: BaseChunkProperty

Properties associated with a TextChunk.

Source code in supermat/core/models/parsed_document.py
131
132
133
134
135
136
137
class TextChunkProperty(BaseChunkProperty):
    """Properties associated with a TextChunk."""

    # Required font description read from "Font".
    font: FontProperties = Field(validation_alias="Font")
    # Optional flag read from "HasClip"; None by default.
    hasclip: bool | None = Field(default=None, validation_alias="HasClip")
    # Optional string read from "Lang"; None by default.
    lang: str | None = Field(default=None, validation_alias="Lang")
    # Required number read from "TextSize".
    text_size: float | int = Field(validation_alias="TextSize")

ValidationWarning

Bases: UserWarning

Custom warning for validation issues in Pydantic models.

Source code in supermat/core/models/parsed_document.py
47
48
class ValidationWarning(UserWarning):
    """Warning category used to report validation issues found in the Pydantic models."""

export_parsed_document(document, output_path, **kwargs)

Export the given ParsedDocument to a JSON file.

Parameters:

Name Type Description Default
document ParsedDocumentType

The ParsedDocument to be dumped.

required
output_path Path | str

JSON file location.

required
Source code in supermat/core/models/parsed_document.py
249
250
251
252
253
254
255
256
257
258
def export_parsed_document(document: ParsedDocumentType, output_path: Path | str, **kwargs) -> None:
    """Export the given ParsedDocument to a JSON file.

    Args:
        document (ParsedDocumentType): The ParsedDocument to be dumped.
        output_path (Path | str): JSON file location.
        **kwargs: Forwarded unchanged to ``ParsedDocument.dump_json``.
    """
    # Serialize before touching the file, so a failing dump cannot leave an existing export truncated.
    payload = ParsedDocument.dump_json(document, **kwargs)
    Path(output_path).write_bytes(payload)

load_parsed_document(path)

Load a json dumped ParsedDocument

Parameters:

Name Type Description Default
path Path | str

file path to the json file.

required

Returns:

Name Type Description
ParsedDocumentType ParsedDocumentType

ParsedDocument model loaded from json.

Source code in supermat/core/models/parsed_document.py
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
def load_parsed_document(path: Path | str) -> ParsedDocumentType:
    """Read a JSON file and validate its content as a `ParsedDocument`.

    The file may hold either a list of chunks, or a dictionary with exactly one key whose value is
    validated instead (a ValidationWarning is emitted in that case).

    Args:
        path (Path | str): file path to the json file.

    Returns:
        ParsedDocumentType: ParsedDocument model loaded from json.

    Raises:
        ValueError: If the JSON is neither a list nor a dictionary with exactly one key.
    """
    raw_doc: list[dict[str, Any]] | dict[str, list[dict[str, Any]]] = orjson.loads(Path(path).read_bytes())

    # Plain list: validate it directly.
    if isinstance(raw_doc, list):
        return ParsedDocument.validate_python(raw_doc)

    # Anything that is not a dictionary with exactly one key is rejected.
    if not isinstance(raw_doc, dict) or len(raw_doc) != 1:
        raise ValueError("Invalid JSON Format")

    # Single root node: unwrap it and validate the value it holds.
    root_key = next(iter(raw_doc))
    warn(f"The json document contains a root node {root_key}.", ValidationWarning)
    return ParsedDocument.validate_python(raw_doc[root_key])