Since Adobe is a paid service, all outputs that Adobe provides for a given PDF file are cached to avoid repeated API calls.
CachedFile
A singleton cache mechanism that caches a given PDF file.
The cached index contains the mapping of the original pdf file path to the processed adobe zip file.
Source code in supermat/core/parser/adobe_parser/_adobe_doc_cacher.py
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131 | class CachedFile:
"""
A singleton Cache mechanism that caches a given pdf file.
The cached index contains the mapping of the original pdf file path to the processed adobe zip file.
"""
_instance: Self | None = None
_lock = threading.Lock()
_cache_dir: Path | None = None
_cache_index: dict[Path, CacheIndex] = {}
_cache_index_filename = "__cache_index.json"
max_cache_size = 100
def __new__(cls):
CachedFile._setup_tmp_dir()
if not cls._instance:
with cls._lock:
if not cls._instance:
cls._instance = super().__new__(cls)
return cls._instance
@staticmethod
def _setup_tmp_dir():
if CachedFile._cache_dir is None:
CachedFile._cache_dir = get_persistent_temp_directory()
CachedFile.reload_cache_index()
def __init__(self):
assert CachedFile._cache_dir is not None
self._cache_path = CachedFile._cache_dir
def exists(self, pdf_file: Path) -> bool:
return pdf_file in CachedFile._cache_index
def get_cached_file_path(self, pdf_file: Path) -> Path:
return CachedFile._cache_index[pdf_file]["cached_file_path"]
@staticmethod
def get_cache_index_path() -> Path | None:
if CachedFile._cache_dir is None:
return None
return Path(CachedFile._cache_dir) / CachedFile._cache_index_filename
@staticmethod
def update_cache_file():
cache_index_path = CachedFile.get_cache_index_path()
if cache_index_path is None:
return
with cache_index_path.open("wb+") as fp:
fp.write(
orjson.dumps({k.as_posix(): v for k, v in CachedFile._cache_index.items()}, default=orjson_defaults)
)
@staticmethod
def reload_cache_index():
cache_index_path = CachedFile.get_cache_index_path()
if cache_index_path is not None and cache_index_path.exists():
with cache_index_path.open("rb") as fp:
CachedFile._cache_index = {
Path(path): CacheIndex(**cache_index) for path, cache_index in orjson.loads(fp.read()).items()
}
def create_file(self, pdf_file: Path, suffix: str = ".zip") -> Path:
cache_index = CacheIndex(
timestamp=datetime.now().timestamp(),
cached_file_path=self._cache_path / pdf_file.with_suffix(pdf_file.suffix + suffix).name,
)
CachedFile._cache_index[pdf_file] = cache_index
CachedFile.cleanup_cache()
CachedFile.update_cache_file()
return self.get_cached_file_path(pdf_file)
@staticmethod
def cleanup_cache():
if len(CachedFile._cache_index) > CachedFile.max_cache_size:
oldest = sorted(CachedFile._cache_index.items(), key=lambda x: x[1]["timestamp"])[0]
del_cache_index = CachedFile._cache_index.pop(oldest[0])
del_cache_index["cached_file_path"].unlink(missing_ok=True)
@staticmethod
def clear_index():
CachedFile._cache_index.clear()
@staticmethod
def cleanup():
if CachedFile._cache_dir:
shutil.rmtree(CachedFile._cache_dir)
CachedFile._cache_dir = get_persistent_temp_directory()
|