Skip to content

Adobe doc cacher

Since Adobe is a paid service, we cache all outputs that Adobe provides for a given PDF file, to avoid repeated API calls.

CachedFile

A singleton Cache mechanism that caches a given pdf file. The cached index contains the mapping of the original pdf file path to the processed adobe zip file.

Source code in supermat/core/parser/adobe_parser/_adobe_doc_cacher.py
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
class CachedFile:
    """
    A singleton Cache mechanism that caches a given pdf file.
    The cached index contains the mapping of the original pdf file path to the processed adobe zip file.

    All state is class-level and shared across the process. The index maps each
    original pdf path to a ``CacheIndex`` entry (timestamp + cached zip path) and
    is persisted as JSON inside the cache directory.
    """

    _instance: Self | None = None
    _lock = threading.Lock()
    # Lazily-created persistent temp directory holding cached zips and the index file.
    _cache_dir: Path | None = None
    # Maps original pdf path -> CacheIndex (keys: "timestamp", "cached_file_path").
    _cache_index: dict[Path, CacheIndex] = {}
    _cache_index_filename = "__cache_index.json"
    # Maximum number of entries retained in the index; oldest entries are evicted.
    max_cache_size = 100

    def __new__(cls):
        # Double-checked locking. Directory setup runs under the same lock so two
        # threads racing on first construction cannot both run reload_cache_index
        # (the original called _setup_tmp_dir unguarded). After first setup,
        # _cache_dir is never None again, so the fast path skips the lock.
        if cls._instance is None or CachedFile._cache_dir is None:
            with cls._lock:
                CachedFile._setup_tmp_dir()
                if cls._instance is None:
                    cls._instance = super().__new__(cls)
        return cls._instance

    @staticmethod
    def _setup_tmp_dir():
        """Create the cache directory on first use and load the persisted index."""
        if CachedFile._cache_dir is None:
            CachedFile._cache_dir = get_persistent_temp_directory()
            CachedFile.reload_cache_index()

    def __init__(self):
        assert CachedFile._cache_dir is not None
        self._cache_path = CachedFile._cache_dir

    def exists(self, pdf_file: Path) -> bool:
        """Return True if `pdf_file` has an entry in the cache index."""
        return pdf_file in CachedFile._cache_index

    def get_cached_file_path(self, pdf_file: Path) -> Path:
        """Return the cached zip path for `pdf_file`.

        Raises:
            KeyError: if `pdf_file` is not in the index (check `exists` first).
        """
        return CachedFile._cache_index[pdf_file]["cached_file_path"]

    @staticmethod
    def get_cache_index_path() -> Path | None:
        """Return the path of the persisted index file, or None before setup."""
        if CachedFile._cache_dir is None:
            return None
        return Path(CachedFile._cache_dir) / CachedFile._cache_index_filename

    @staticmethod
    def update_cache_file():
        """Persist the in-memory index to disk as JSON (no-op before setup)."""
        cache_index_path = CachedFile.get_cache_index_path()
        if cache_index_path is None:
            return

        # "wb" (not "wb+"): the file is only written, never read back here.
        with cache_index_path.open("wb") as fp:
            fp.write(
                orjson.dumps({k.as_posix(): v for k, v in CachedFile._cache_index.items()}, default=orjson_defaults)
            )

    @staticmethod
    def reload_cache_index():
        """Replace the in-memory index with the persisted one, if present."""
        cache_index_path = CachedFile.get_cache_index_path()
        if cache_index_path is not None and cache_index_path.exists():
            with cache_index_path.open("rb") as fp:
                CachedFile._cache_index = {
                    Path(path): CacheIndex(**cache_index) for path, cache_index in orjson.loads(fp.read()).items()
                }

    def create_file(self, pdf_file: Path, suffix: str = ".zip") -> Path:
        """Register a cache entry for `pdf_file` and return the target zip path.

        The caller is expected to write the adobe output to the returned path.
        The entry's filename is the pdf name with `suffix` appended
        (e.g. ``doc.pdf`` -> ``doc.pdf.zip``).
        """
        cache_index = CacheIndex(
            timestamp=datetime.now().timestamp(),
            cached_file_path=self._cache_path / pdf_file.with_suffix(pdf_file.suffix + suffix).name,
        )
        CachedFile._cache_index[pdf_file] = cache_index
        CachedFile.cleanup_cache()
        CachedFile.update_cache_file()
        return self.get_cached_file_path(pdf_file)

    @staticmethod
    def cleanup_cache():
        """Evict oldest entries until the index is within `max_cache_size`.

        The previous implementation removed at most one entry per call, which
        never converges when the index exceeds the limit by more than one
        (e.g. after loading a large persisted index from disk).
        """
        while len(CachedFile._cache_index) > CachedFile.max_cache_size:
            oldest_path = min(
                CachedFile._cache_index, key=lambda p: CachedFile._cache_index[p]["timestamp"]
            )
            evicted = CachedFile._cache_index.pop(oldest_path)
            # Best-effort file removal; the entry may already be gone on disk.
            evicted["cached_file_path"].unlink(missing_ok=True)

    @staticmethod
    def clear_index():
        """Drop all in-memory index entries (does not touch files on disk)."""
        CachedFile._cache_index.clear()

    @staticmethod
    def cleanup():
        """Delete the whole cache directory and start with a fresh one.

        Also clears the in-memory index: the previous implementation left it
        populated, so `exists` kept reporting files whose backing zips had
        just been deleted.
        """
        if CachedFile._cache_dir:
            shutil.rmtree(CachedFile._cache_dir)
            CachedFile._cache_index.clear()
            CachedFile._cache_dir = get_persistent_temp_directory()