Parsed document

`BaseChunk`

Bases: CustomBaseModel

BaseChunk that contains the type and structure of the chunk.

Source code in supermat/core/models/parsed_document.py

class BaseChunk(CustomBaseModel):
    """Common base for all chunks; holds the chunk type and its structure string."""

    type_: Literal["Text", "Image", "Footnote"] = Field(alias="type", frozen=True)
    structure: str
    document: str = ""

    @overload
    def has_subsection(self, sub_section: BaseChunk) -> bool: ...  # noqa: U100, E704

    @overload
    def has_subsection(self, sub_section: str) -> bool: ...  # noqa: U100, E704

    def has_subsection(self, sub_section: BaseChunk | str) -> bool:
        """Check `sub_section` against this chunk using the module-level `is_subsection` helper.

        Args:
            sub_section (BaseChunk | str): Another chunk (its `structure` is used) or a structure string.

        Returns:
            bool: Result of `is_subsection(<other structure>, self.structure)`.
        """
        if isinstance(sub_section, str):
            other_structure = sub_section
        else:
            other_structure = sub_section.structure
        return is_subsection(other_structure, self.structure)

    @overload
    def is_subsection(self, section: BaseChunk) -> bool: ...  # noqa: U100, E704

    @overload
    def is_subsection(self, section: str) -> bool: ...  # noqa: U100, E704

    def is_subsection(self, section: BaseChunk | str) -> bool:
        """Check this chunk against `section` using the module-level `is_subsection` helper.

        Args:
            section (BaseChunk | str): Another chunk (its `structure` is used) or a structure string.

        Returns:
            bool: Result of `is_subsection(self.structure, <other structure>)`.
        """
        if isinstance(section, str):
            other_structure = section
        else:
            other_structure = section.structure
        # Inside the method body this name resolves to the module-level helper, not to this method.
        return is_subsection(self.structure, other_structure)

`BaseChunkProperty`

Bases: CustomBaseModel

Properties associated with a chunk. Close to Adobe's format.

Source code in supermat/core/models/parsed_document.py

class BaseChunkProperty(CustomBaseModel):
    """Properties associated with a chunk. Close to Adobe's format.

    Most fields are read from capitalised input keys through validation aliases.
    """

    # Optional identifier; accepted from either the "ObjectID" or the "ObjectId" input key.
    object_id: int | None = Field(default=None, validation_alias=AliasChoices("ObjectID", "ObjectId"))
    # Required 4-tuple of numbers read from "Bounds".
    # NOTE(review): presumably a bounding box; coordinate order is not shown here - confirm.
    bounds: tuple[float | int, float | int, float | int, float | int] = Field(validation_alias="Bounds")
    # Required integer read from "Page" (indexing base is not established in this file - confirm).
    page: int = Field(validation_alias="Page")
    # Optional string read from "Path".
    path: str | None = Field(default=None, validation_alias="Path")
    # Optional free-form mapping; no alias is declared, so it is read from the "attributes" key.
    attributes: dict[str, Any] | None = None

`BaseTextChunk`

Bases: BaseChunk

Common TextChunk model.

Source code in supermat/core/models/parsed_document.py

class BaseTextChunk(BaseChunk):
    """Common TextChunk model.

    Extends `BaseChunk` with the text payload and optional nested data.
    """

    # Required text content of the chunk.
    text: str
    # Required list of strings (presumably keywords/keys for the text - confirm against the producer).
    key: list[str]
    # Optional chunk properties; subclasses may narrow this to a more specific property model.
    properties: BaseChunkProperty | None = None
    # Optional nested chunks, typed through the forward reference `ChunkModelForwardRefType`.
    sentences: Sequence[ChunkModelForwardRefType] | None = None

`CustomBaseModel`

Bases: BaseModel

BaseModel with some extra tweaks. Needed this to handle previous output of parsed documents which has optional keys and needed to be saved for tests.

Source code in supermat/core/models/parsed_document.py

class CustomBaseModel(BaseModel):
    """
    BaseModel with some extra tweaks.

    Needed to handle previous output of parsed documents, which has optional keys and needs to be saved for tests.
    On construction the model records which alias each field is associated with and which keys were absent from
    the input; the custom serializer then uses that bookkeeping to rename fields and drop the absent keys.
    """

    model_config = ConfigDict(populate_by_name=True, json_schema_extra={"by_alias": True}, extra="forbid")
    # Maps a field name to the alias recorded for it in `__init__`.
    _original_alias: dict[str, str] = PrivateAttr()
    # Keys (alias when one was recorded, else field name) that were not present in the constructor input.
    _unexisted_keys: set[str] = PrivateAttr()

    def __init__(self, **data: Any):
        """Record alias usage and missing keys, then run the regular pydantic initialisation.

        Args:
            **data: Raw field values keyed by field name or alias.
        """
        aliases: dict[str, str] = {}
        unexisted_keys: set[str] = set()
        # Read the field definitions from the class: instance access to `model_fields` is deprecated
        # in recent pydantic releases, and the mapping is class-level data anyway.
        for field_name, field in type(self).model_fields.items():
            alias_found = False
            if isinstance(field.validation_alias, AliasChoices):
                # Several aliases are accepted; remember the first one that is actually present in the input.
                for alias in field.validation_alias.choices:
                    if alias in data:
                        aliases[field_name] = alias
                        alias_found = True
                        break
            elif field.alias is not None or field.validation_alias is not None:
                # A single alias is declared; it is recorded whether or not the input used it.
                alias = field.alias or field.validation_alias
                if TYPE_CHECKING:
                    assert isinstance(alias, str)
                aliases[field_name] = alias
                alias_found = True

            # The key counts as missing when neither the recorded alias nor the field name is in the input.
            if not ((alias_found and aliases[field_name] in data) or (field_name in data)):
                unexisted_keys.add(aliases[field_name] if alias_found else field_name)

        super().__init__(**data)
        # Private attributes are assigned only after pydantic has initialised the instance.
        self._original_alias = aliases
        self._unexisted_keys = unexisted_keys

    @model_serializer(mode="wrap")
    def serialize_model(self, nxt: SerializerFunctionWrapHandler) -> dict[str, Any]:
        """Serialize with the recorded aliases and without the keys that were absent from the input.

        Args:
            nxt (SerializerFunctionWrapHandler): The default serializer handler supplied by pydantic.

        Returns:
            dict[str, Any]: The default serialization with recorded fields renamed to their alias and with
                keys that were missing at construction removed. Keys of frozen fields are always kept.
        """
        serialized = nxt(self)
        # Rename every recorded field that is still keyed by its field name to its recorded alias.
        aliased_values = {
            renamed_field_name: serialized.pop(field_name)
            for field_name, renamed_field_name in self._original_alias.items()
            if field_name in serialized
        }
        serialized.update(aliased_values)
        # Frozen fields are exempt from removal even when they were not part of the input.
        _unexisted_keys = self._unexisted_keys - {
            field.alias or field_name for field_name, field in type(self).model_fields.items() if field.frozen
        }
        cleaned_serialized = {
            field_name: value for field_name, value in serialized.items() if field_name not in _unexisted_keys
        }
        return cleaned_serialized

`serialize_model(nxt)`

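This custom serializer emits each field under the alias recorded at construction and omits keys that were absent from the original input; keys of frozen fields are always kept.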

Source code in supermat/core/models/parsed_document.py

@model_serializer(mode="wrap")
def serialize_model(self, nxt: SerializerFunctionWrapHandler) -> dict[str, Any]:
    """Serialize with the recorded aliases and without the keys that were absent from the input.

    Args:
        nxt (SerializerFunctionWrapHandler): The default serializer handler supplied by pydantic.

    Returns:
        dict[str, Any]: The default serialization with recorded fields renamed to their alias and with
            keys that were missing at construction removed. Keys of frozen fields are always kept.
    """
    serialized = nxt(self)
    # Rename every recorded field that is still keyed by its field name to its recorded alias.
    aliased_values = {
        renamed_field_name: serialized.pop(field_name)
        for field_name, renamed_field_name in self._original_alias.items()
        if field_name in serialized
    }
    serialized.update(aliased_values)
    # Read field definitions from the class; instance access to `model_fields` is deprecated in
    # recent pydantic releases. Frozen fields are exempt from removal.
    _unexisted_keys = self._unexisted_keys - {
        field.alias or field_name for field_name, field in type(self).model_fields.items() if field.frozen
    }
    cleaned_serialized = {
        field_name: value for field_name, value in serialized.items() if field_name not in _unexisted_keys
    }
    return cleaned_serialized

`FontProperties`

Bases: CustomBaseModel

Font properties in a TextChunkProperty.

Source code in supermat/core/models/parsed_document.py

class FontProperties(CustomBaseModel):
    """Font properties in a TextChunkProperty.

    Only `name` is required; every other declared field is optional and defaults to None.
    """

    # Sets extra="allow" here, whereas CustomBaseModel declares extra="forbid".
    model_config = ConfigDict(extra="allow")
    alt_family_name: str | None = None
    embedded: bool | None = None
    encoding: str | None = None
    family_name: str | None = None
    font_type: str | None = None
    italic: bool | None = None
    monospaced: bool | None = None
    # The only required field of this model.
    name: str
    subset: bool | None = None
    weight: int | None = None

`FootnoteChunk`

Bases: TextChunk

TextChunk which is a Footnote

Source code in supermat/core/models/parsed_document.py

class FootnoteChunk(TextChunk):
    """TextChunk which is a Footnote.

    Identical to `TextChunk` except that the type field is fixed to "Footnote".
    """

    # Narrows the inherited type field to the single literal "Footnote"; defaults to it, is read
    # from the "type" alias and is frozen.
    type_: Literal["Footnote"] = Field(  # pyright: ignore[reportIncompatibleVariableOverride]
        default="Footnote", alias="type", frozen=True
    )

`ImageChunk`

Bases: BaseChunk, BaseChunkProperty

ImageChunk that stores the image in Base64 encoding.

Source code in supermat/core/models/parsed_document.py

class ImageChunk(BaseChunk, BaseChunkProperty):
    """Image chunk; the image payload is kept in the Base64-typed `figure_object` field."""

    type_: Literal["Image"] = Field(  # pyright: ignore[reportIncompatibleVariableOverride]
        default="Image", alias="type", frozen=True
    )
    figure: str | None = None
    figure_object: Base64Bytes | None = Field(validation_alias="figure-object", repr=False)

    @field_validator("figure_object", mode="before")
    @classmethod
    def validate_data(cls, value: Base64Bytes | None, info: ValidationInfo):  # noqa: U100
        """Pass `figure_object` through unchanged, warning when it is None.

        Args:
            value (Base64Bytes | None): Raw input for the field, seen before pydantic's own validation.
            info (ValidationInfo): Validation context; only `field_name` is read, for the warning text.

        Returns:
            The given value, or None (after emitting a `ValidationWarning`) when the value is None.
        """
        # TODO (@legendof-selda): figure out a way to find the path where this fails.
        # NOTE: This shouldn't be allowed, but in the sample we have a case where the images aren't saved.
        if value is not None:
            return value
        warn(f"{info.field_name} is None.", ValidationWarning)
        return None

`TextChunk`

Bases: BaseTextChunk

TextChunk which was similar to the initial version of supermat.

Source code in supermat/core/models/parsed_document.py

class TextChunk(BaseTextChunk):
    """TextChunk which was similar to the initial version of supermat.

    Adds optional speaker, timestamp and annotation data on top of `BaseTextChunk`.
    """

    # Narrows the inherited type field to the single literal "Text"; defaults to it, is read from
    # the "type" alias and is frozen.
    type_: Literal["Text"] = Field(  # pyright: ignore[reportIncompatibleVariableOverride]
        default="Text", alias="type", frozen=True
    )
    # Optional free-form mapping (its expected keys are not defined in this file).
    speaker: dict[str, Any] | None = None
    # Optional timestamp kept as a plain string; no format is enforced here.
    timestamp: str | None = None
    # Optional list of annotation strings.
    annotations: list[str] | None = None
    # Narrows the inherited `properties` field to the text-specific property model.
    properties: TextChunkProperty | None = None  # pyright: ignore[reportIncompatibleVariableOverride]

`TextChunkProperty`

Bases: BaseChunkProperty

Properties associated with a TextChunk.

Source code in supermat/core/models/parsed_document.py

class TextChunkProperty(BaseChunkProperty):
    """Properties associated with a TextChunk.

    Extends `BaseChunkProperty` with font, clipping, language and text-size data, each read from a
    capitalised input key through a validation alias.
    """

    # Required font description read from "Font".
    font: FontProperties = Field(validation_alias="Font")
    # Optional flag read from "HasClip".
    hasclip: bool | None = Field(default=None, validation_alias="HasClip")
    # Optional language string read from "Lang".
    lang: str | None = Field(default=None, validation_alias="Lang")
    # Required numeric text size read from "TextSize" (unit is not established here - confirm).
    text_size: float | int = Field(validation_alias="TextSize")

`ValidationWarning`

Bases: UserWarning

Custom warning for validation issues in Pydantic models.

Source code in supermat/core/models/parsed_document.py

class ValidationWarning(UserWarning):
    """Custom warning for validation issues in Pydantic models.

    Emitted via `warnings.warn` for input that is tolerated but unexpected, e.g. a missing
    `figure_object` on an image chunk or a JSON document wrapped in a single root node.
    """

`export_parsed_document(document, output_path, **kwargs)`

Export given ParsedDocument to a json file

Parameters:

Name	Type	Description	Default
`document`	`ParsedDocumentType`	The ParsedDocument to be dumped.	required
`output_path`	`Path \| str`	JSON file location.	required

Source code in supermat/core/models/parsed_document.py

def export_parsed_document(document: ParsedDocumentType, output_path: Path | str, **kwargs):
    """Export given ParsedDocument to a json file

    Args:
        document (ParsedDocumentType): The ParsedDocument to be dumped.
        output_path (Path | str): JSON file location.
        **kwargs: Extra keyword arguments forwarded unchanged to `ParsedDocument.dump_json`.
    """
    # Serialize first: if dumping fails, an existing file at `output_path` is left untouched
    # instead of being truncated by an already-open write handle.
    payload = ParsedDocument.dump_json(document, **kwargs)
    Path(output_path).write_bytes(payload)

`load_parsed_document(path)`

Load a json dumped ParsedDocument

Parameters:

Name	Type	Description	Default
`path`	`Path \| str`	file path to the json file.	required

Returns:

Name	Type	Description
`ParsedDocumentType`	`ParsedDocumentType`	ParsedDocument model loaded from json.

Source code in supermat/core/models/parsed_document.py

def load_parsed_document(path: Path | str) -> ParsedDocumentType:
    """Read a JSON file and validate its content as a `ParsedDocument`.

    The file may hold either a list of chunk dicts or a dict with exactly one root key whose
    value is that list; the latter is accepted with a `ValidationWarning`.

    Args:
        path (Path | str): file path to the json file.

    Returns:
        ParsedDocumentType: ParsedDocument model loaded from json.

    Raises:
        ValueError: If the top-level JSON value is neither a list nor a single-key dict.
    """
    raw_doc: list[dict[str, Any]] | dict[str, list[dict[str, Any]]] = orjson.loads(Path(path).read_bytes())

    # Plain list: validate it directly.
    if isinstance(raw_doc, list):
        return ParsedDocument.validate_python(raw_doc)

    # Single root node wrapping the list: unwrap it, but tell the caller about it.
    if isinstance(raw_doc, dict) and len(raw_doc) == 1:
        root_key = next(iter(raw_doc))
        warn(f"The json document contains a root node {root_key}.", ValidationWarning)
        return ParsedDocument.validate_python(raw_doc[root_key])

    raise ValueError("Invalid JSON Format")