Parser

The parser submodule contains all Parser implementations that convert a given file type to a ParsedDocument. For a Parser to be registered, it needs to be included here. TODO (@legendof-selda): Dynamically register all parsers.

To create a new Parser, create a submodule for it containing a parser.py; this is where the Parser implementation is written. Any utilities associated with that parser go in utils.py. Also import the Parser in the submodule's __init__.py file for easier importing.

`FileProcessor`

Source code in supermat/core/parser/file_processor.py

class FileProcessor:
    """Registry that maps file extensions to the `Handler`s able to parse them.

    Parsers are added through the `register` decorator. All state lives on the class itself,
    so every method is a `staticmethod` and the registry is shared process-wide.
    """

    # Every registered handler, keyed by `Handler.name`.
    _registered_handlers: dict[str, Handler] = {}
    # Names of all handlers registered for each lower-cased, dot-prefixed extension.
    _handlers: dict[str, list[str]] = defaultdict(list)
    # Name of the single 'main' handler of each extension.
    _main_handlers: dict[str, str] = {}
    # One or more dot-separated alphanumeric parts, e.g. ".pdf" or ".tar.gz".
    _file_extension_pattern = re.compile(r"^\.[a-zA-Z0-9]+(?:\.[a-zA-Z0-9]+)*$")

    @staticmethod
    def _register(handler: Handler, extension: str, main: bool):
        """Store `handler` under its name and attach it to `extension`.

        Args:
            handler (Handler): The handler to store.
            extension (str): The normalized extension the handler is attached to.
            main (bool): Whether the handler becomes the 'main' handler of `extension`.
        """
        FileProcessor._registered_handlers[handler.name] = handler
        FileProcessor._handlers[extension].append(handler.name)
        if main:
            FileProcessor._main_handlers[extension] = handler.name

    @staticmethod
    def register(
        extension: str, *, converters: type[Converter] | Iterable[type[Converter]] | None = None, main: bool = False
    ) -> Callable[[P], P]:
        """A `register` decorator that registers a `Parser` to the specified document `extension` type
        and the list of `Converter`s that need to run before parsing the document.

        Example:

        ```python
        @FileProcessor.register(".html")
        @FileProcessor.register(".pdf", converters=PDF2HTMLConverter, main=True)
        @FileProcessor.register(".docx", converters=[Docx2PDFConverter, PDF2HTMLConverter])
        class HTMLParser(Parser):
            def parse(self, file_path: Path) -> ParsedDocumentType:
                ...
        ```

        Args:
            extension (str): The file extension that the parser will handle. It is lower-cased and
                a leading "." is added when missing.
            converters (type[Converter] | Iterable[type[Converter]] | None, optional):
                List of `Converter`s that converts a given file first before parsing it. Defaults to None.
            main (bool, optional): Specifies if the decorated `Parser` is the 'main' parser for this extension type.
                You can have multiple parsers for the same extension but only one of them can be the 'main' one.
                Defaults to False.

        Returns:
            Callable[[type[Parser]], type[Parser]]: A decorator that registers the given Parser

        Raises:
            ValueError: If `extension` is malformed, or `main` is set while the extension already
                has a main parser (checked both here and when the decorator is applied).
            TypeError: If a converter is not a `Converter` subclass, or the decorated class is not
                a `Parser` subclass.
        """
        # NOTE: this only works if the register has reached. Meaning we need to manually import it in __init__.py
        extension = extension.lower()
        if not extension.startswith("."):
            extension = f".{extension}"
        if not FileProcessor._file_extension_pattern.match(extension):
            raise ValueError(f"Invalid file extension: {extension}")

        # Materialize once: a one-shot iterable (e.g. a generator) would otherwise be exhausted by
        # the validation below and yield no converters when the handler is finally built.
        if converters is None:
            converter_types = ()
        elif isinstance(converters, Iterable):
            converter_types = tuple(converters)
        else:
            converter_types = (converters,)
        not_converters = [converter for converter in converter_types if not issubclass(converter, Converter)]
        if not_converters:
            raise TypeError(f"{not_converters} are not subclasses of {Converter}")

        def _assert_main_slot_free() -> None:
            if main and extension in FileProcessor._main_handlers:
                raise ValueError(
                    f"{extension} is already registered to {FileProcessor._main_handlers[extension]}! "
                    "Only one main parser can be registered for given extension."
                )

        _assert_main_slot_free()

        def decorator(parser: P) -> P:
            if not issubclass(parser, Parser):
                raise TypeError(f"{parser} is not a subclass of {Parser}")
            # Stacked decorators evaluate every `register(...)` call before any of them is applied,
            # so the main slot may have been taken since the check above ran.
            _assert_main_slot_free()
            parser_instance = parser()
            converter_instances = tuple(converter() for converter in converter_types) or None
            handler = Handler(parser=parser_instance, converters=converter_instances)
            FileProcessor._register(handler, extension, main=main)
            return parser

        return decorator

    @staticmethod
    def get_main_handler(file_path: Path | str) -> Handler:
        """Get the 'main' handler that can handle the given file.

        Compound extensions are tried from most to least specific, so a file named
        `backup.tar.gz` matches a ".tar.gz" registration before a ".gz" one.

        Args:
            file_path (Path | str): The file that needs to be handled.

        Returns:
            Handler: The main handler associated with this file type.

        Raises:
            ValueError: If no main handler is registered for any extension of the file.
        """
        file_path = Path(file_path)
        suffixes = [suffix.lower() for suffix in file_path.suffixes]

        # `register` accepts multi-part extensions, which `Path.suffix` alone can never match.
        for start in range(len(suffixes)):
            handler_id = FileProcessor._main_handlers.get("".join(suffixes[start:]))
            if handler_id is not None:
                return FileProcessor._registered_handlers[handler_id]

        raise ValueError(f"No main handler registered for file type: {file_path.suffix.lower()}")

    @staticmethod
    def get_handler(handler_name: str) -> Handler:
        """Retrieve the registered handler from the given `handler_name`.

        Args:
            handler_name (str): Unique name given to the registered `Handler`.

        Returns:
            Handler: The registered `Handler`.

        Raises:
            KeyError: If no handler is registered under `handler_name`.
        """
        return FileProcessor._registered_handlers[handler_name]

    @staticmethod
    def get_handlers(file_path: Path | str) -> dict[str, Handler]:
        """Get all the handlers that can handle the given file.

        Compound extensions are tried from most to least specific; the handlers of the first
        extension that has any registration are returned.

        Args:
            file_path (Path | str): The file that needs to be handled.

        Returns:
            dict[str, Handler]: The handlers associated with this file type, keyed by handler name.
                Empty when nothing is registered for the file's extension.
        """
        suffixes = [suffix.lower() for suffix in Path(file_path).suffixes]

        # `register` accepts multi-part extensions, which `Path.suffix` alone can never match.
        for start in range(len(suffixes)):
            handler_names = FileProcessor._handlers.get("".join(suffixes[start:]))
            if handler_names:
                return {handle_name: FileProcessor.get_handler(handle_name) for handle_name in handler_names}

        return {}

    @staticmethod
    def parse_file(file_path: Path | str) -> ParsedDocumentType:
        """Parses a file and returns the `ParsedDocument` after retrieving the 'main' handler for it.

        Args:
            file_path (Path | str): The file_path that needs to be parsed.

        Returns:
            ParsedDocumentType: The parsed format of the file.

        Raises:
            ValueError: If no main handler is registered for the file's extension.
        """
        handler = FileProcessor.get_main_handler(file_path)
        return handler.parse_file(file_path)

    @staticmethod
    def process_file(file_path: Path | str, **kwargs) -> Path:
        """Parses a file and saves the `ParsedDocument` json and returns the file path to it
        after retrieving the 'main' handler for it.

        Args:
            file_path (Path | str): The file_path that needs to be parsed.
            **kwargs: Forwarded unchanged to the handler's `process_file`.

        Returns:
            Path: The path to the json exported `ParsedDocument` which is nearby the given `file_path`.

        Raises:
            ValueError: If no main handler is registered for the file's extension.
        """
        handler = FileProcessor.get_main_handler(file_path)
        return handler.process_file(file_path, **kwargs)

`get_handler(handler_name)` `staticmethod`

Retrieve the registered handler from the given handler_name.

Parameters:

Name	Type	Description	Default
`handler_name`	`str`	Unique name given to the registered `Handler`.	required

Returns:

Name	Type	Description
`Handler`	`Handler`	The registered `Handler`.

Source code in supermat/core/parser/file_processor.py

@staticmethod
def get_handler(handler_name: str) -> Handler:
    """Look up a registered `Handler` by its unique name.

    Args:
        handler_name (str): Name under which the `Handler` was registered.

    Returns:
        Handler: The `Handler` stored under `handler_name`.
    """
    registry = FileProcessor._registered_handlers
    return registry[handler_name]

`get_handlers(file_path)` `staticmethod`

Get all the handlers that can handle the given file.

Parameters:

Name	Type	Description	Default
`file_path`	`Path \| str`	The file that needs to be handled.	required

Returns:

Type	Description
`dict[str, Handler]`	The handlers associated with this file type.

Source code in supermat/core/parser/file_processor.py

@staticmethod
def get_handlers(file_path: Path | str) -> dict[str, Handler]:
    """Get all the handlers that can handle the given file.

    Compound extensions are tried from most to least specific; the handlers of the first
    extension that has any registration are returned.

    Args:
        file_path (Path | str): The file that needs to be handled.

    Returns:
        dict[str, Handler]: The handlers associated with this file type, keyed by handler name.
            Empty when nothing is registered for the file's extension.
    """
    suffixes = [suffix.lower() for suffix in Path(file_path).suffixes]

    # `register` accepts multi-part extensions such as ".tar.gz", which `Path.suffix` alone
    # can never match, so walk from the longest compound extension down to the last suffix.
    for start in range(len(suffixes)):
        handler_names = FileProcessor._handlers.get("".join(suffixes[start:]))
        if handler_names:
            return {handle_name: FileProcessor.get_handler(handle_name) for handle_name in handler_names}

    return {}

`get_main_handler(file_path)` `staticmethod`

Get the 'main' handler that can handle the given file.

Parameters:

Name	Type	Description	Default
`file_path`	`Path \| str`	The file that needs to be handled.	required

Returns:

Name	Type	Description
`Handler`	`Handler`	The main handler associated with this file type.

Source code in supermat/core/parser/file_processor.py

@staticmethod
def get_main_handler(file_path: Path | str) -> Handler:
    """Get the 'main' handler that can handle the given file.

    Compound extensions are tried from most to least specific, so a file named
    `backup.tar.gz` matches a ".tar.gz" registration before a ".gz" one.

    Args:
        file_path (Path | str): The file that needs to be handled.

    Returns:
        Handler: The main handler associated with this file type.

    Raises:
        ValueError: If no main handler is registered for any extension of the file.
    """
    file_path = Path(file_path)
    suffixes = [suffix.lower() for suffix in file_path.suffixes]

    # `register` accepts multi-part extensions such as ".tar.gz", which `Path.suffix` alone
    # can never match, so walk from the longest compound extension down to the last suffix.
    for start in range(len(suffixes)):
        handler_id = FileProcessor._main_handlers.get("".join(suffixes[start:]))
        if handler_id is not None:
            return FileProcessor._registered_handlers[handler_id]

    raise ValueError(f"No main handler registered for file type: {file_path.suffix.lower()}")

`parse_file(file_path)` `staticmethod`

Parses a file and returns the ParsedDocument after retrieving the 'main' handler for it.

Parameters:

Name	Type	Description	Default
`file_path`	`Path \| str`	The file_path that needs to be parsed.	required

Returns:

Name	Type	Description
`ParsedDocumentType`	`ParsedDocumentType`	The parsed format of the file.

Source code in supermat/core/parser/file_processor.py

@staticmethod
def parse_file(file_path: Path | str) -> ParsedDocumentType:
    """Parse `file_path` with the 'main' handler registered for its file type.

    Args:
        file_path (Path | str): Location of the file to parse.

    Returns:
        ParsedDocumentType: Whatever the main handler's `parse_file` returns for the file.
    """
    return FileProcessor.get_main_handler(file_path).parse_file(file_path)

`process_file(file_path, **kwargs)` `staticmethod`

Parses a file and saves the ParsedDocument json and returns the file path to it after retrieving the 'main' handler for it.

Parameters:

Name	Type	Description	Default
`file_path`	`Path \| str`	The file_path that needs to be parsed.	required

Returns:

Name	Type	Description
`Path`	`Path`	The path to the json exported `ParsedDocument` which is nearby the given `file_path`.

Source code in supermat/core/parser/file_processor.py

@staticmethod
def process_file(file_path: Path | str, **kwargs) -> Path:
    """Process `file_path` with the 'main' handler registered for its file type.

    The handler parses the file, saves the `ParsedDocument` json, and reports where it was written.

    Args:
        file_path (Path | str): Location of the file to process.
        **kwargs: Passed through unchanged to the handler's `process_file`.

    Returns:
        Path: The path to the json exported `ParsedDocument` which is nearby the given `file_path`.
    """
    main_handler = FileProcessor.get_main_handler(file_path)
    output_path = main_handler.process_file(file_path, **kwargs)
    return output_path

`register(extension, *, converters=None, main=False)` `staticmethod`

A register decorator that registers a Parser for the specified document extension type, along with the list of Converters that need to run before parsing the document.

Example:

@FileProcessor.register(".html")
@FileProcessor.register(".pdf", converters=PDF2HTMLConverter, main=True)
@FileProcessor.register(".docx", converters=[Docx2PDFConverter, PDF2HTMLConverter])
class HTMLParser(Parser):
    def parse(self, file_path: Path) -> ParsedDocumentType:
        ...

Parameters:

Name	Type	Description	Default
`extension`	`str`	The file extension that the parser will handle.	required
`converters`	`type[Converter] \| Iterable[type[Converter]] \| None`	List of `Converter`s that converts a given file first before parsing it. Defaults to None.	`None`
`main`	`bool`	Specifies if the decorated `Parser` is the 'main' parser for this extension type. You can have multiple parsers for the same extension but only one of them can be the 'main' one. Defaults to False.	`False`

Returns:

Type	Description
`Callable[[P], P]`	A decorator that takes a `Parser` subclass (`type[Parser]`), registers it, and returns the same class.

Source code in supermat/core/parser/file_processor.py

@staticmethod
def register(
    extension: str, *, converters: type[Converter] | Iterable[type[Converter]] | None = None, main: bool = False
) -> Callable[[P], P]:
    """A `register` decorator that registers a `Parser` to the specified document `extension` type
    and the list of `Converter`s that need to run before parsing the document.

    Example:

    ```python
    @FileProcessor.register(".html")
    @FileProcessor.register(".pdf", converters=PDF2HTMLConverter, main=True)
    @FileProcessor.register(".docx", converters=[Docx2PDFConverter, PDF2HTMLConverter])
    class HTMLParser(Parser):
        def parse(self, file_path: Path) -> ParsedDocumentType:
            ...
    ```

    Args:
        extension (str): The file extension that the parser will handle. It is lower-cased and
            a leading "." is added when missing.
        converters (type[Converter] | Iterable[type[Converter]] | None, optional):
            List of `Converter`s that converts a given file first before parsing it. Defaults to None.
        main (bool, optional): Specifies if the decorated `Parser` is the 'main' parser for this extension type.
            You can have multiple parsers for the same extension but only one of them can be the 'main' one.
            Defaults to False.

    Returns:
        Callable[[type[Parser]], type[Parser]]: A decorator that registers the given Parser

    Raises:
        ValueError: If `extension` is malformed, or `main` is set while the extension already
            has a main parser (checked both here and when the decorator is applied).
        TypeError: If a converter is not a `Converter` subclass, or the decorated class is not
            a `Parser` subclass.
    """
    # NOTE: this only works if the register has reached. Meaning we need to manually import it in __init__.py
    extension = extension.lower()
    if not extension.startswith("."):
        extension = f".{extension}"
    if not FileProcessor._file_extension_pattern.match(extension):
        raise ValueError(f"Invalid file extension: {extension}")

    # Materialize once: a one-shot iterable (e.g. a generator) would otherwise be exhausted by
    # the validation below and yield no converters when the handler is finally built.
    if converters is None:
        converter_types = ()
    elif isinstance(converters, Iterable):
        converter_types = tuple(converters)
    else:
        converter_types = (converters,)
    not_converters = [converter for converter in converter_types if not issubclass(converter, Converter)]
    if not_converters:
        raise TypeError(f"{not_converters} are not subclasses of {Converter}")

    def _assert_main_slot_free() -> None:
        if main and extension in FileProcessor._main_handlers:
            raise ValueError(
                f"{extension} is already registered to {FileProcessor._main_handlers[extension]}! "
                "Only one main parser can be registered for given extension."
            )

    _assert_main_slot_free()

    def decorator(parser: P) -> P:
        if not issubclass(parser, Parser):
            raise TypeError(f"{parser} is not a subclass of {Parser}")
        # Stacked decorators evaluate every `register(...)` call before any of them is applied,
        # so the main slot may have been taken since the check above ran.
        _assert_main_slot_free()
        parser_instance = parser()
        converter_instances = tuple(converter() for converter in converter_types) or None
        handler = Handler(parser=parser_instance, converters=converter_instances)
        FileProcessor._register(handler, extension, main=main)
        return parser

    return decorator

`PyMuPDFParser`

Bases: Parser

Parses a pdf file using PyMuPDF library.

Source code in supermat/core/parser/pymupdf_parser/parser.py

@FileProcessor.register(".pdf")
class PyMuPDFParser(Parser):
    """PDF parser backed by the PyMuPDF library."""

    def parse(self, file_path: Path) -> ParsedDocumentType:
        """Parse the PDF at `file_path`, naming the resulting document after the file's stem."""
        return process_pymupdf(parse_pdf(file_path), document_name=file_path.stem)

Parser

FileProcessor

get_handler(handler_name) staticmethod

get_handlers(file_path) staticmethod

get_main_handler(file_path) staticmethod

parse_file(file_path) staticmethod

process_file(file_path, **kwargs) staticmethod

register(extension, *, converters=None, main=False) staticmethod

PyMuPDFParser

`FileProcessor`

`get_handler(handler_name)` `staticmethod`

`get_handlers(file_path)` `staticmethod`

`get_main_handler(file_path)` `staticmethod`

`parse_file(file_path)` `staticmethod`

`process_file(file_path, **kwargs)` `staticmethod`

`register(extension, *, converters=None, main=False)` `staticmethod`

`PyMuPDFParser`