Skip to content

Parser

The parser submodule contains all Parser implementations, each of which converts a given file type to a ParsedDocument. For a Parser to be registered, it needs to be imported here. TODO (@legendof-selda): Dynamically register all parsers.

To create a new Parser, create a submodule for it containing a parser.py file; this is where the Parser implementation is written. Any utilities associated with that parser go in utils.py. Also import the Parser in the submodule's __init__.py file for easier importing.

FileProcessor

Source code in supermat/core/parser/file_processor.py
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
class FileProcessor:
    """Registry that maps file extensions to the `Handler`s able to process them.

    All state lives in class-level dictionaries, so every method is a `staticmethod`
    and registrations are shared process-wide.
    """

    # Every registered handler, keyed by its `Handler.name`.
    _registered_handlers: dict[str, Handler] = {}
    # Names of all handlers registered for each (lower-cased, dot-prefixed) extension.
    _handlers: dict[str, list[str]] = defaultdict(list)
    # Name of the single 'main' handler for each extension.
    _main_handlers: dict[str, str] = {}
    # One or more dot-prefixed alphanumeric parts, e.g. ".pdf" or ".tar.gz".
    _file_extension_pattern = re.compile(r"^\.[a-zA-Z0-9]+(?:\.[a-zA-Z0-9]+)*$")

    @staticmethod
    def _register(handler: Handler, extension: str, main: bool):
        """Store `handler` under its name and attach it to `extension`.

        Args:
            handler (Handler): The handler to store.
            extension (str): Normalised extension the handler is attached to.
            main (bool): When true, the handler also becomes the main handler for `extension`.
        """
        FileProcessor._registered_handlers[handler.name] = handler
        FileProcessor._handlers[extension].append(handler.name)
        if main:
            FileProcessor._main_handlers[extension] = handler.name

    @staticmethod
    def _candidate_extensions(file_path: Path | str) -> list[str]:
        """List the extensions a file name could be registered under, most specific first.

        For `archive.tar.gz` this yields `[".tar.gz", ".gz"]`, so parsers registered with a
        multi-part extension (which `register` accepts) can actually be found.

        Args:
            file_path (Path | str): The file whose name is inspected.

        Returns:
            list[str]: Lower-cased candidate extensions; empty when the name has no suffix.
        """
        suffixes = [suffix.lower() for suffix in Path(file_path).suffixes]
        return ["".join(suffixes[start:]) for start in range(len(suffixes))]

    @staticmethod
    def register(
        extension: str, *, converters: type[Converter] | Iterable[type[Converter]] | None = None, main: bool = False
    ) -> Callable[[P], P]:
        """A `register` decorator that registers a `Parser` to specified document `extension` type
        and the list of `Converter`s that needs to run before parsing the document.

        Example:

        ```python
        @FileProcessor.register(".html")
        @FileProcessor.register(".pdf", converters=PDF2HTMLConverter, main=True)
        @FileProcessor.register(".docx", converters=[Docx2PDFConverter, PDF2HTMLConverter])
        class HTMLParser(Parser):
            def parse(self, file_path: Path) -> ParsedDocumentType:
                ...
        ```

        Args:
            extension (str): The file extension that the parser will handle. It is lower-cased and
                a leading dot is added when missing.
            converters (type[Converter] | Iterable[type[Converter]] | None, optional):
                List of `Converter`s that converts a given file first before parsing it. Defaults to None.
            main (bool, optional): Specifies if the decorated `Parser` is the 'main' parser for this extension type.
                You can have multiple parsers for the same extension but only one of them can be the 'main' one.
                Defaults to False.

        Returns:
            Callable[[type[Parser]], type[Parser]]: A decorator that registers the given Parser

        Raises:
            ValueError: If `extension` is malformed, or `main` is set while another main parser
                is already registered for `extension`.
            TypeError: If any converter is not a `Converter` subclass, or the decorated class
                is not a `Parser` subclass.
        """
        # NOTE: this only works if the register has reached. Meaning we need to manually import it in __init__.py
        extension = extension.lower()
        if not extension.startswith("."):
            extension = f".{extension}"
        if not FileProcessor._file_extension_pattern.match(extension):
            raise ValueError(f"Invalid file extension: {extension}")

        # Materialise once: a one-shot iterable (e.g. a generator) would otherwise be exhausted
        # by the validation below and yield no converters when the decorator runs.
        if converters is None:
            converter_types = None
        elif isinstance(converters, Iterable):
            converter_types = tuple(converters)
        else:
            converter_types = (converters,)
        if converter_types is not None and (
            not_converters := [converter for converter in converter_types if not issubclass(converter, Converter)]
        ):
            raise TypeError(f"{not_converters} are not subclasses of {Converter}")

        def ensure_main_slot_is_free() -> None:
            if main and extension in FileProcessor._main_handlers:
                raise ValueError(
                    f"{extension} is already registered to {FileProcessor._main_handlers[extension]}! "
                    "Only one main parser can be registered for given extension."
                )

        ensure_main_slot_is_free()

        def decorator(parser: P) -> P:
            if not issubclass(parser, Parser):
                raise TypeError(f"{parser} is not a subclass of {Parser}")
            # Stacked decorators evaluate every `register(...)` call before any decorator body
            # runs, so the main slot may have been taken since the check above.
            ensure_main_slot_is_free()
            handler = Handler(
                parser=parser(),
                converters=tuple(converter() for converter in converter_types) if converter_types else None,
            )
            FileProcessor._register(handler, extension, main=main)
            return parser

        return decorator

    @staticmethod
    def get_main_handler(file_path: Path | str) -> Handler:
        """Get the 'main' handler that can handle the given file.

        Multi-part extensions are tried from most to least specific, so `x.tar.gz` prefers a
        main handler registered for `.tar.gz` over one registered for `.gz`.

        Args:
            file_path (Path | str): The file that needs to be handled.

        Returns:
            Handler: The main handler associated with this file type.

        Raises:
            ValueError: If no main handler is registered for any extension of the file.
        """
        file_path = Path(file_path)
        for extension in FileProcessor._candidate_extensions(file_path):
            handler_id = FileProcessor._main_handlers.get(extension)
            if handler_id is not None:
                return FileProcessor._registered_handlers[handler_id]

        raise ValueError(f"No main handler registered for file type: {file_path.suffix.lower()}")

    @staticmethod
    def get_handler(handler_name: str) -> Handler:
        """Retrieve the registered handler from the given `handler_name`.

        Args:
            handler_name (str): Unique name given to the registered `Handler`.

        Returns:
            Handler: The registered `Handler`.

        Raises:
            KeyError: If no handler is registered under `handler_name`.
        """
        return FileProcessor._registered_handlers[handler_name]

    @staticmethod
    def get_handlers(file_path: Path | str) -> dict[str, Handler]:
        """Get all the handlers that can handle the given file.

        Multi-part extensions are tried from most to least specific; the handlers of the first
        extension that has any registrations are returned.

        Args:
            file_path (Path | str): The file that needs to be handled.

        Returns:
            dict[str, Handler]: The handlers associated with this file type, keyed by handler
                name. Empty when nothing is registered for the file's extension.
        """
        for extension in FileProcessor._candidate_extensions(file_path):
            handler_names = FileProcessor._handlers.get(extension)
            if handler_names:
                return {handle_name: FileProcessor.get_handler(handle_name) for handle_name in handler_names}
        return {}

    @staticmethod
    def parse_file(file_path: Path | str) -> ParsedDocumentType:
        """Parses a file and returns the `ParsedDocument` after retrieving the 'main' handler for it.

        Args:
            file_path (Path | str): The file_path that needs to be parsed.

        Returns:
            ParsedDocumentType: The parsed format of the file.

        Raises:
            ValueError: If no main handler is registered for the file's extension.
        """
        handler = FileProcessor.get_main_handler(file_path)
        return handler.parse_file(file_path)

    @staticmethod
    def process_file(file_path: Path | str, **kwargs) -> Path:
        """Parses a file and saves the `ParsedDocument` json and returns the file path to it
        after retrieving the 'main' handler for it.

        Args:
            file_path (Path | str): The file_path that needs to be parsed.
            **kwargs: Forwarded unchanged to the handler's `process_file`.

        Returns:
            Path: The path to the json exported `ParsedDocument` which is nearby the given `file_path`.

        Raises:
            ValueError: If no main handler is registered for the file's extension.
        """
        handler = FileProcessor.get_main_handler(file_path)
        return handler.process_file(file_path, **kwargs)

get_handler(handler_name) staticmethod

Retrieve the registered handler from the given handler_name.

Parameters:

Name Type Description Default
handler_name str

Unique name given to the registered Handler.

required

Returns:

Name Type Description
Handler Handler

The registered Handler.

Source code in supermat/core/parser/file_processor.py
184
185
186
187
188
189
190
191
192
193
194
@staticmethod
def get_handler(handler_name: str) -> Handler:
    """Look up a registered `Handler` by its unique name.

    Args:
        handler_name (str): Name under which the `Handler` was registered.

    Returns:
        Handler: The `Handler` stored under `handler_name`.
    """
    registry = FileProcessor._registered_handlers
    return registry[handler_name]

get_handlers(file_path) staticmethod

Get all the handlers that can handle the given file.

Parameters:

Name Type Description Default
file_path Path | str

The file that needs to be handled.

required

Returns:

Type Description
dict[str, Handler]

dict[str, Handler]: The handlers associated with this file type.

Source code in supermat/core/parser/file_processor.py
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
@staticmethod
def get_handlers(file_path: Path | str) -> dict[str, Handler]:
    """Get all the handlers that can handle the given file.

    Multi-part extensions (which `register` accepts, e.g. `.tar.gz`) are tried from most to
    least specific; the handlers of the first extension that has any registrations are returned.

    Args:
        file_path (Path | str): The file that needs to be handled.

    Returns:
        dict[str, Handler]: The handlers associated with this file type, keyed by handler name.
            Empty when nothing is registered for the file's extension.
    """
    suffixes = [suffix.lower() for suffix in Path(file_path).suffixes]
    # "x.tar.gz" -> ".tar.gz", then ".gz"; the last candidate is the plain `Path.suffix`.
    for start in range(len(suffixes)):
        handler_names = FileProcessor._handlers.get("".join(suffixes[start:]))
        if handler_names:
            return {handle_name: FileProcessor.get_handler(handle_name) for handle_name in handler_names}
    return {}

get_main_handler(file_path) staticmethod

Get the 'main' handler that can handle the given file.

Parameters:

Name Type Description Default
file_path Path | str

The file that needs to be handled.

required

Returns:

Name Type Description
Handler Handler

The main handler associated with this file type.

Source code in supermat/core/parser/file_processor.py
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
@staticmethod
def get_main_handler(file_path: Path | str) -> Handler:
    """Get the 'main' handler that can handle the given file.

    Multi-part extensions (which `register` accepts, e.g. `.tar.gz`) are tried from most to
    least specific, so `x.tar.gz` prefers a main handler registered for `.tar.gz` over `.gz`.

    Args:
        file_path (Path | str): The file that needs to be handled.

    Returns:
        Handler: The main handler associated with this file type.

    Raises:
        ValueError: If no main handler is registered for any extension of the file.
    """
    file_path = Path(file_path)
    suffixes = [suffix.lower() for suffix in file_path.suffixes]

    # "x.tar.gz" -> ".tar.gz", then ".gz"; the last candidate is the plain `Path.suffix`.
    for start in range(len(suffixes)):
        handler_id = FileProcessor._main_handlers.get("".join(suffixes[start:]))
        if handler_id is not None:
            return FileProcessor._registered_handlers[handler_id]

    raise ValueError(f"No main handler registered for file type: {file_path.suffix.lower()}")

parse_file(file_path) staticmethod

Parses a file and returns the ParsedDocument after retrieving the 'main' handler for it.

Parameters:

Name Type Description Default
file_path Path | str

The file_path that needs to be parsed.

required

Returns:

Name Type Description
ParsedDocumentType ParsedDocumentType

The parsed format of the file.

Source code in supermat/core/parser/file_processor.py
214
215
216
217
218
219
220
221
222
223
224
225
@staticmethod
def parse_file(file_path: Path | str) -> ParsedDocumentType:
    """Parse `file_path` with the 'main' handler registered for its file type.

    Args:
        file_path (Path | str): Location of the file to parse.

    Returns:
        ParsedDocumentType: Whatever the main handler's `parse_file` returns for this file.
    """
    return FileProcessor.get_main_handler(file_path).parse_file(file_path)

process_file(file_path, **kwargs) staticmethod

Parses a file and saves the ParsedDocument json and returns the file path to it after retrieving the 'main' handler for it.

Parameters:

Name Type Description Default
file_path Path | str

The file_path that needs to be parsed.

required

Returns:

Name Type Description
Path Path

The path to the json exported ParsedDocument which is nearby the given file_path.

Source code in supermat/core/parser/file_processor.py
227
228
229
230
231
232
233
234
235
236
237
238
239
@staticmethod
def process_file(file_path: Path | str, **kwargs) -> Path:
    """Process `file_path` with the 'main' handler registered for its file type.

    The work is delegated to the main handler's `process_file`, which parses the file,
    saves the `ParsedDocument` json and reports where it was written.

    Args:
        file_path (Path | str): Location of the file to process.
        **kwargs: Passed through unchanged to the handler's `process_file`.

    Returns:
        Path: The path to the json exported `ParsedDocument` which is nearby the given `file_path`.
    """
    return FileProcessor.get_main_handler(file_path).process_file(file_path, **kwargs)

register(extension, *, converters=None, main=False) staticmethod

A register decorator that registers a Parser to the specified document extension type, along with the list of Converters that need to run before parsing the document.

Example:

@FileProcessor.register(".html")
@FileProcessor.register(".pdf", converters=PDF2HTMLConverter, main=True)
@FileProcessor.register(".docx", converters=[Docx2PDFConverter, PDF2HTMLConverter])
class HTMLParser(Parser):
    def parse(self, file_path: Path) -> ParsedDocumentType:
        ...

Parameters:

Name Type Description Default
extension str

The file extension that the parser will handle.

required
converters type[Converter] | Iterable[type[Converter]] | None

List of Converters that converts a given file first before parsing it. Defaults to None.

None
main bool

Specifies if the decorated Parser is the 'main' parser for this extension type. You can have multiple parsers for the same extension but only one of them can be the 'main' one. Defaults to False.

False

Returns:

Type Description
Callable[[P], P]

Callable[[type[Parser]], type[Parser]]: A decorator that registers the given Parser

Source code in supermat/core/parser/file_processor.py
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
@staticmethod
def register(
    extension: str, *, converters: type[Converter] | Iterable[type[Converter]] | None = None, main: bool = False
) -> Callable[[P], P]:
    """A `register` decorator that registers a `Parser` to specified document `extension` type
    and the list of `Converter`s that needs to run before parsing the document.

    Example:

    ```python
    @FileProcessor.register(".html")
    @FileProcessor.register(".pdf", converters=PDF2HTMLConverter, main=True)
    @FileProcessor.register(".docx", converters=[Docx2PDFConverter, PDF2HTMLConverter])
    class HTMLParser(Parser):
        def parse(self, file_path: Path) -> ParsedDocumentType:
            ...
    ```

    Args:
        extension (str): The file extension that the parser will handle. It is lower-cased and
            a leading dot is added when missing.
        converters (type[Converter] | Iterable[type[Converter]] | None, optional):
            List of `Converter`s that converts a given file first before parsing it. Defaults to None.
        main (bool, optional): Specifies if the decorated `Parser` is the 'main' parser for this extension type.
            You can have multiple parsers for the same extension but only one of them can be the 'main' one.
            Defaults to False.

    Returns:
        Callable[[type[Parser]], type[Parser]]: A decorator that registers the given Parser

    Raises:
        ValueError: If `extension` is malformed, or `main` is set while another main parser
            is already registered for `extension`.
        TypeError: If any converter is not a `Converter` subclass, or the decorated class
            is not a `Parser` subclass.
    """
    # NOTE: this only works if the register has reached. Meaning we need to manually import it in __init__.py
    extension = extension.lower()
    if not extension.startswith("."):
        extension = f".{extension}"
    if not FileProcessor._file_extension_pattern.match(extension):
        raise ValueError(f"Invalid file extension: {extension}")

    # Materialise once: a one-shot iterable (e.g. a generator) would otherwise be exhausted
    # by the validation below and yield no converters when the decorator runs.
    if converters is None:
        converter_types = None
    elif isinstance(converters, Iterable):
        converter_types = tuple(converters)
    else:
        converter_types = (converters,)
    if converter_types is not None and (
        not_converters := [converter for converter in converter_types if not issubclass(converter, Converter)]
    ):
        raise TypeError(f"{not_converters} are not subclasses of {Converter}")

    def ensure_main_slot_is_free() -> None:
        if main and extension in FileProcessor._main_handlers:
            raise ValueError(
                f"{extension} is already registered to {FileProcessor._main_handlers[extension]}! "
                "Only one main parser can be registered for given extension."
            )

    ensure_main_slot_is_free()

    def decorator(parser: P) -> P:
        if not issubclass(parser, Parser):
            raise TypeError(f"{parser} is not a subclass of {Parser}")
        # Stacked decorators evaluate every `register(...)` call before any decorator body
        # runs, so the main slot may have been taken since the check above.
        ensure_main_slot_is_free()
        handler = Handler(
            parser=parser(),
            converters=tuple(converter() for converter in converter_types) if converter_types else None,
        )
        FileProcessor._register(handler, extension, main=main)
        return parser

    return decorator

PyMuPDFParser

Bases: Parser

Parses a pdf file using PyMuPDF library.

Source code in supermat/core/parser/pymupdf_parser/parser.py
114
115
116
117
118
119
120
@FileProcessor.register(".pdf")
class PyMuPDFParser(Parser):
    """`Parser` for pdf files backed by the PyMuPDF library.

    Registered with `FileProcessor` for the `.pdf` extension (not as the 'main' parser).
    """

    def parse(self, file_path: Path) -> ParsedDocumentType:
        """Parse the pdf at `file_path`, naming the resulting document after the file's stem.

        Args:
            file_path (Path): Location of the pdf file to parse.

        Returns:
            ParsedDocumentType: The result of `process_pymupdf` applied to the output of `parse_pdf`.
        """
        return process_pymupdf(parse_pdf(file_path), document_name=file_path.stem)