Skip to content

File processor

FileProcessor provides an easy-to-use API to convert any given document by selecting its appropriate Parser after it is converted to a compatible format using the Converter.

FileProcessor

Source code in supermat/core/parser/file_processor.py
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
class FileProcessor:
    """Registry that maps file extensions to `Handler`s and dispatches files to them.

    All state lives in class-level dictionaries, so every registration is shared by
    all users of the class. Parsers are added through the `register` decorator and
    looked up by the (lower-cased) last suffix of the file path.
    """

    # Every registered handler, keyed by `Handler.name`.
    _registered_handlers: dict[str, Handler] = {}
    # Names of the handlers registered for an extension, in registration order.
    _handlers: dict[str, list[str]] = defaultdict(list)
    # Name of the single 'main' handler of an extension.
    _main_handlers: dict[str, str] = {}
    # A leading dot followed by one or more dot-separated alphanumeric parts.
    _file_extension_pattern = re.compile(r"^\.[a-zA-Z0-9]+(?:\.[a-zA-Z0-9]+)*$")

    @staticmethod
    def _register(handler: Handler, extension: str, main: bool):
        """Store `handler` in the registries under `extension`.

        Args:
            handler (Handler): The handler to store; it is keyed by `handler.name`.
            extension (str): The extension the handler is registered for.
            main (bool): When True the handler also becomes the 'main' handler of `extension`.
        """
        FileProcessor._registered_handlers[handler.name] = handler
        FileProcessor._handlers[extension].append(handler.name)
        if main:
            FileProcessor._main_handlers[extension] = handler.name

    @staticmethod
    def register(
        extension: str, *, converters: type[Converter] | Iterable[type[Converter]] | None = None, main: bool = False
    ) -> Callable[[P], P]:
        """A `register` decorator that registers a `Parser` to the specified document `extension` type
        and the list of `Converter`s that need to run before parsing the document.

        Example:

        ```python
        @FileProcessor.register(".html")
        @FileProcessor.register(".pdf", converters=PDF2HTMLConverter, main=True)
        @FileProcessor.register(".docx", converters=[Docx2PDFConverter, PDF2HTMLConverter])
        class HTMLParser(Parser):
            def parse(self, file_path: Path) -> ParsedDocumentType:
                ...
        ```

        Args:
            extension (str): The file extension that the parser will handle. It is lower-cased and a
                leading dot is added when missing.
            converters (type[Converter] | Iterable[type[Converter]] | None, optional):
                List of `Converter`s that converts a given file first before parsing it. Any iterable,
                including a one-shot iterator, is accepted. Defaults to None.
            main (bool, optional): Specifies if the decorated `Parser` is the 'main' parser for this extension type.
                You can have multiple parsers for the same extension but only one of them can be the 'main' one.
                Defaults to False.

        Returns:
            Callable[[type[Parser]], type[Parser]]: A decorator that registers the given Parser

        Raises:
            ValueError: If `extension` is malformed, or if `main` is True and the extension already has
                a 'main' handler (checked both when `register` is called and when the decorator is applied).
            TypeError: If any converter is not a `Converter` subclass, or the decorated class is not a
                `Parser` subclass.
        """
        # NOTE: this only works if the register has reached. Meaning we need to manually import it in __init__.py
        extension = extension.lower()
        if not extension.startswith("."):
            extension = f".{extension}"
        if not FileProcessor._file_extension_pattern.match(extension):
            raise ValueError(f"Invalid file extension: {extension}")

        converter_types: tuple[type[Converter], ...] = ()
        if converters is not None:
            if not isinstance(converters, Iterable):
                converters = (converters,)
            # Materialise exactly once: the validation below and the instantiation in the decorator both
            # iterate, and a one-shot iterable (e.g. a generator) would be empty the second time around.
            converter_types = tuple(converters)
            if not_converters := [converter for converter in converter_types if not issubclass(converter, Converter)]:
                raise TypeError(f"{not_converters} are not subclasses of {Converter}")

        def ensure_main_slot_is_free() -> None:
            if main and extension in FileProcessor._main_handlers:
                raise ValueError(
                    f"{extension} is already registered to {FileProcessor._main_handlers[extension]}! "
                    "Only one main parser can be registered for given extension."
                )

        # Fail early, while `register(...)` itself is being evaluated.
        ensure_main_slot_is_free()

        def decorator(parser: P) -> P:
            if not issubclass(parser, Parser):
                raise TypeError(f"{parser} is not a subclass of {Parser}")
            # With stacked decorators every `register(...)` expression is evaluated before any decorator
            # is applied, so two 'main' registrations can both pass the early check. Check again here so
            # the second one is rejected instead of silently replacing the first.
            ensure_main_slot_is_free()
            handler = Handler(
                parser=parser(),
                converters=tuple(converter() for converter in converter_types) if converter_types else None,
            )
            FileProcessor._register(handler, extension, main=main)
            return parser

        return decorator

    @staticmethod
    def get_main_handler(file_path: Path | str) -> Handler:
        """Get the 'main' handler that can handle the given file.

        Args:
            file_path (Path | str): The file that needs to be handled.

        Returns:
            Handler: The main handler associated with this file type.

        Raises:
            ValueError: If no 'main' handler is registered for the file's (lower-cased) suffix.
        """
        file_path = Path(file_path)
        file_ext = file_path.suffix.lower()

        handler_id = FileProcessor._main_handlers.get(file_ext)
        if handler_id is None:
            raise ValueError(f"No main handler registered for file type: {file_ext}")

        return FileProcessor._registered_handlers[handler_id]

    @staticmethod
    def get_handler(handler_name: str) -> Handler:
        """Retrieve the registered handler from the given `handler_name`.

        Args:
            handler_name (str): Unique name given to the registered `Handler`.

        Returns:
            Handler: The registered `Handler`.

        Raises:
            KeyError: If no handler is registered under `handler_name`.
        """
        return FileProcessor._registered_handlers[handler_name]

    @staticmethod
    def get_handlers(file_path: Path | str) -> dict[str, Handler]:
        """Get all the handlers that can handle the given file.

        Args:
            file_path (Path | str): The file that needs to be handled.

        Returns:
            dict[str, Handler]: The handlers associated with this file type, keyed by handler name.
                Empty when nothing is registered for the file's suffix.
        """
        file_path = Path(file_path)
        file_ext = file_path.suffix.lower()

        # `.get` (rather than indexing) avoids creating an empty entry in the defaultdict.
        return {
            handle_name: FileProcessor.get_handler(handle_name)
            for handle_name in FileProcessor._handlers.get(file_ext, [])
        }

    @staticmethod
    def parse_file(file_path: Path | str) -> ParsedDocumentType:
        """Parses a file and returns the `ParsedDocument` after retrieving the 'main' handler for it.

        Args:
            file_path (Path | str): The file_path that needs to be parsed.

        Returns:
            ParsedDocumentType: The parsed format of the file.

        Raises:
            ValueError: If no 'main' handler is registered for the file type.
        """
        handler = FileProcessor.get_main_handler(file_path)
        return handler.parse_file(file_path)

    @staticmethod
    def process_file(file_path: Path | str, **kwargs) -> Path:
        """Parses a file and saves the `ParsedDocument` json and returns the file path to it
        after retrieving the 'main' handler for it.

        Args:
            file_path (Path | str): The file_path that needs to be parsed.
            **kwargs: Forwarded unchanged to the handler's `process_file`.

        Returns:
            Path: The path to the json exported `ParsedDocument` which is nearby the given `file_path`.

        Raises:
            ValueError: If no 'main' handler is registered for the file type.
        """
        handler = FileProcessor.get_main_handler(file_path)
        return handler.process_file(file_path, **kwargs)

get_handler(handler_name) staticmethod

Retrieve the registered handler from the given handler_name.

Parameters:

Name Type Description Default
handler_name str

Unique name given to the registered Handler.

required

Returns:

Name Type Description
Handler Handler

The registered Handler.

Source code in supermat/core/parser/file_processor.py
184
185
186
187
188
189
190
191
192
193
194
@staticmethod
def get_handler(handler_name: str) -> Handler:
    """Look up a registered `Handler` by its unique name.

    Args:
        handler_name (str): Name under which the `Handler` was registered.

    Returns:
        Handler: The `Handler` stored under `handler_name`.
    """
    registry = FileProcessor._registered_handlers
    return registry[handler_name]

get_handlers(file_path) staticmethod

Get all the handlers that can handle the given file.

Parameters:

Name Type Description Default
file_path Path | str

The file that needs to be handled.

required

Returns:

Type Description
dict[str, Handler]

dict[str, Handler]: The handlers associated with this file type.

Source code in supermat/core/parser/file_processor.py
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
@staticmethod
def get_handlers(file_path: Path | str) -> dict[str, Handler]:
    """Collect every handler registered for the file's extension.

    Args:
        file_path (Path | str): The file whose handlers are requested.

    Returns:
        dict[str, Handler]: Handlers for the file's lower-cased suffix, keyed by handler name.
    """
    extension = Path(file_path).suffix.lower()
    # `.get` keeps the lookup from inserting an empty list for unknown extensions.
    registered_names = FileProcessor._handlers.get(extension, [])

    found = {}
    for name in registered_names:
        found[name] = FileProcessor.get_handler(name)
    return found

get_main_handler(file_path) staticmethod

Get the 'main' handler that can handle the given file.

Parameters:

Name Type Description Default
file_path Path | str

The file that needs to be handled.

required

Returns:

Name Type Description
Handler Handler

The main handler associated with this file type.

Source code in supermat/core/parser/file_processor.py
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
@staticmethod
def get_main_handler(file_path: Path | str) -> Handler:
    """Return the 'main' handler registered for the file's extension.

    Args:
        file_path (Path | str): The file that needs a handler.

    Returns:
        Handler: The 'main' handler for the file's lower-cased suffix.

    Raises:
        ValueError: If the suffix has no 'main' handler registered.
    """
    extension = Path(file_path).suffix.lower()

    if (main_name := FileProcessor._main_handlers.get(extension, None)) is None:
        raise ValueError(f"No main handler registered for file type: {extension}")

    return FileProcessor._registered_handlers[main_name]

parse_file(file_path) staticmethod

Parses a file and returns the ParsedDocument after retrieving the 'main' handler for it.

Parameters:

Name Type Description Default
file_path Path | str

The file_path that needs to be parsed.

required

Returns:

Name Type Description
ParsedDocumentType ParsedDocumentType

The parsed format of the file.

Source code in supermat/core/parser/file_processor.py
214
215
216
217
218
219
220
221
222
223
224
225
@staticmethod
def parse_file(file_path: Path | str) -> ParsedDocumentType:
    """Parse a file with the 'main' handler registered for its type.

    Args:
        file_path (Path | str): Location of the file to parse.

    Returns:
        ParsedDocumentType: Whatever the handler's `parse_file` returns for this file.
    """
    return FileProcessor.get_main_handler(file_path).parse_file(file_path)

process_file(file_path, **kwargs) staticmethod

Parses a file and saves the ParsedDocument json and returns the file path to it after retrieving the 'main' handler for it.

Parameters:

Name Type Description Default
file_path Path | str

The file_path that needs to be parsed.

required

Returns:

Name Type Description
Path Path

The path to the json exported ParsedDocument which is nearby the given file_path.

Source code in supermat/core/parser/file_processor.py
227
228
229
230
231
232
233
234
235
236
237
238
239
@staticmethod
def process_file(file_path: Path | str, **kwargs) -> Path:
    """Process a file with the 'main' handler registered for its type.

    Args:
        file_path (Path | str): Location of the file to process.
        **kwargs: Passed through unchanged to the handler's `process_file`.

    Returns:
        Path: The path returned by the handler's `process_file`.
    """
    main_handler = FileProcessor.get_main_handler(file_path)
    output_path = main_handler.process_file(file_path, **kwargs)
    return output_path

register(extension, *, converters=None, main=False) staticmethod

A register decorator that registers a Parser to the specified document extension type and the list of Converters that need to run before parsing the document.

Example:

@FileProcessor.register(".html")
@FileProcessor.register(".pdf", converters=PDF2HTMLConverter, main=True)
@FileProcessor.register(".docx", converters=[Docx2PDFConverter, PDF2HTMLConverter])
class HTMLParser(Parser):
    def parse(self, file_path: Path) -> ParsedDocumentType:
        ...

Parameters:

Name Type Description Default
extension str

The file extension that the parser will handle.

required
converters type[Converter] | Iterable[type[Converter]] | None

List of Converters that converts a given file first before parsing it. Defaults to None.

None
main bool

Specifies if the decorated Parser is the 'main' parser for this extension type. You can have multiple parsers for the same extension but only one of them can be the 'main' one. Defaults to False.

False

Returns:

Type Description
Callable[[P], P]

Callable[[type[Parser]], type[Parser]]: A decorator that registers the given Parser

Source code in supermat/core/parser/file_processor.py
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
@staticmethod
def register(
    extension: str, *, converters: type[Converter] | Iterable[type[Converter]] | None = None, main: bool = False
) -> Callable[[P], P]:
    """A `register` decorator that registers a `Parser` to the specified document `extension` type
    and the list of `Converter`s that need to run before parsing the document.

    Example:

    ```python
    @FileProcessor.register(".html")
    @FileProcessor.register(".pdf", converters=PDF2HTMLConverter, main=True)
    @FileProcessor.register(".docx", converters=[Docx2PDFConverter, PDF2HTMLConverter])
    class HTMLParser(Parser):
        def parse(self, file_path: Path) -> ParsedDocumentType:
            ...
    ```

    Args:
        extension (str): The file extension that the parser will handle. It is lower-cased and a
            leading dot is added when missing.
        converters (type[Converter] | Iterable[type[Converter]] | None, optional):
            List of `Converter`s that converts a given file first before parsing it. Any iterable,
            including a one-shot iterator, is accepted. Defaults to None.
        main (bool, optional): Specifies if the decorated `Parser` is the 'main' parser for this extension type.
            You can have multiple parsers for the same extension but only one of them can be the 'main' one.
            Defaults to False.

    Returns:
        Callable[[type[Parser]], type[Parser]]: A decorator that registers the given Parser

    Raises:
        ValueError: If `extension` is malformed, or if `main` is True and the extension already has
            a 'main' handler (checked both when `register` is called and when the decorator is applied).
        TypeError: If any converter is not a `Converter` subclass, or the decorated class is not a
            `Parser` subclass.
    """
    # NOTE: this only works if the register has reached. Meaning we need to manually import it in __init__.py
    extension = extension.lower()
    if not extension.startswith("."):
        extension = f".{extension}"
    if not FileProcessor._file_extension_pattern.match(extension):
        raise ValueError(f"Invalid file extension: {extension}")

    converter_types: tuple[type[Converter], ...] = ()
    if converters is not None:
        if not isinstance(converters, Iterable):
            converters = (converters,)
        # Materialise exactly once: the validation below and the instantiation in the decorator both
        # iterate, and a one-shot iterable (e.g. a generator) would be empty the second time around.
        converter_types = tuple(converters)
        if not_converters := [converter for converter in converter_types if not issubclass(converter, Converter)]:
            raise TypeError(f"{not_converters} are not subclasses of {Converter}")

    def ensure_main_slot_is_free() -> None:
        if main and extension in FileProcessor._main_handlers:
            raise ValueError(
                f"{extension} is already registered to {FileProcessor._main_handlers[extension]}! "
                "Only one main parser can be registered for given extension."
            )

    # Fail early, while `register(...)` itself is being evaluated.
    ensure_main_slot_is_free()

    def decorator(parser: P) -> P:
        if not issubclass(parser, Parser):
            raise TypeError(f"{parser} is not a subclass of {Parser}")
        # With stacked decorators every `register(...)` expression is evaluated before any decorator
        # is applied, so two 'main' registrations can both pass the early check. Check again here so
        # the second one is rejected instead of silently replacing the first.
        ensure_main_slot_is_free()
        handler = Handler(
            parser=parser(),
            converters=tuple(converter() for converter in converter_types) if converter_types else None,
        )
        FileProcessor._register(handler, extension, main=main)
        return parser

    return decorator

Handler dataclass

Handler saves the combination of a parser and its required Converters to process a given file document.

Source code in supermat/core/parser/file_processor.py
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
@dataclass
class Handler:
    """
    Pairs a `Parser` with the `Converter`s that must run on a file before that parser sees it.
    """

    parser: Parser
    converters: tuple[Converter, ...] | None = None

    @property
    def name(self) -> str:
        # The parser's class name, followed by the "|"-joined converter class names in brackets when any exist.
        parser_name = f"{type(self.parser).__name__}"
        if not self.converters:
            return parser_name
        converter_names = "|".join(type(converter).__name__ for converter in self.converters)
        return f"{parser_name}[{converter_names}]"

    def convert(self, file_path: Path) -> Path:
        """Run the file through every converter in order, feeding each one the previous output.

        Args:
            file_path (Path): The file to convert.

        Returns:
            Path: The path returned by the last converter, or `file_path` itself when there are no converters.
        """
        current_path = file_path
        for converter in self.converters or ():
            current_path = converter.convert(current_path)
        return current_path

    def parse(self, file_path: Path) -> ParsedDocumentType:
        """Convert the file with the configured `Converter`s, then hand the result to the `Parser`.

        Args:
            file_path (Path): The file to parse.

        Returns:
            ParsedDocumentType: The parser's output for the converted file.
        """
        converted_path = self.convert(file_path)
        return self.parser.parse(converted_path)

    def parse_file(self, file_path: Path | str) -> ParsedDocumentType:
        """Parse a file given as either a `Path` or a string.

        Args:
            file_path (Path | str): The file to parse.

        Returns:
            ParsedDocumentType: The parsed format of the file.
        """
        return self.parse(Path(file_path))

    def process_file(self, file_path: Path | str, **kwargs) -> Path:
        """Parse a file, export the parsed document, and return where it was written.

        The output path is `file_path` with its suffix replaced by the lower-cased suffix plus `.json`.

        Args:
            file_path (Path | str): The file to parse.
            **kwargs: Passed through unchanged to `export_parsed_document`.

        Returns:
            Path: The path the parsed document was exported to.
        """
        source_path = Path(file_path)
        json_suffix = f"{source_path.suffix.lower()}.json"
        document = self.parse_file(source_path)
        output_path = source_path.with_suffix(json_suffix)
        export_parsed_document(document, output_path, **kwargs)
        return output_path

convert(file_path)

Takes a file_path and chains all the converters on that file, saves it and returns the location of the converted file.

Parameters:

Name Type Description Default
file_path Path

The file_path that needs to be converted.

required

Returns:

Name Type Description
Path Path

The path to the converted file.

Source code in supermat/core/parser/file_processor.py
38
39
40
41
42
43
44
45
46
47
48
def convert(self, file_path: Path) -> Path:
    """Run the file through every converter in order, feeding each one the previous output.

    Args:
        file_path (Path): The file to convert.

    Returns:
        Path: The path returned by the last converter, or `file_path` itself when there are no converters.
    """
    current_path = file_path
    for converter in self.converters or ():
        current_path = converter.convert(current_path)
    return current_path

parse(file_path)

Parses the given file_path by the given Parser after being converted by the given Converters.

Parameters:

Name Type Description Default
file_path Path

The file_path that needs to be parsed.

required

Returns:

Name Type Description
ParsedDocumentType ParsedDocumentType

The parsed format of the file.

Source code in supermat/core/parser/file_processor.py
50
51
52
53
54
55
56
57
58
59
def parse(self, file_path: Path) -> ParsedDocumentType:
    """Convert the file with the configured `Converter`s, then hand the result to the `Parser`.

    Args:
        file_path (Path): The file to parse.

    Returns:
        ParsedDocumentType: The parser's output for the converted file.
    """
    converted_path = self.convert(file_path)
    return self.parser.parse(converted_path)

parse_file(file_path)

Parses a file and returns the ParsedDocument.

Parameters:

Name Type Description Default
file_path Path | str

The file_path that needs to be parsed.

required

Returns:

Name Type Description
ParsedDocumentType ParsedDocumentType

The parsed format of the file.

Source code in supermat/core/parser/file_processor.py
61
62
63
64
65
66
67
68
69
70
71
72
def parse_file(self, file_path: Path | str) -> ParsedDocumentType:
    """Parse a file given as either a `Path` or a string.

    Args:
        file_path (Path | str): The file to parse.

    Returns:
        ParsedDocumentType: The parsed format of the file.
    """
    return self.parse(Path(file_path))

process_file(file_path, **kwargs)

Parses a file and saves the ParsedDocument json and returns the file path to it.

Parameters:

Name Type Description Default
file_path Path | str

The file_path that needs to be parsed.

required

Returns:

Name Type Description
Path Path

The path to the json exported ParsedDocument which is nearby the given file_path.

Source code in supermat/core/parser/file_processor.py
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
def process_file(self, file_path: Path | str, **kwargs) -> Path:
    """Parse a file, export the parsed document, and return where it was written.

    The output path is `file_path` with its suffix replaced by the lower-cased suffix plus `.json`.

    Args:
        file_path (Path | str): The file to parse.
        **kwargs: Passed through unchanged to `export_parsed_document`.

    Returns:
        Path: The path the parsed document was exported to.
    """
    source_path = Path(file_path)
    json_suffix = f"{source_path.suffix.lower()}.json"
    document = self.parse_file(source_path)
    output_path = source_path.with_suffix(json_suffix)
    export_parsed_document(document, output_path, **kwargs)
    return output_path