-
Notifications
You must be signed in to change notification settings - Fork 1.5k
Open
Labels
PdfWriterThe PdfWriter component is affectedThe PdfWriter component is affectedis-bugFrom a users perspective, this is a bug - a violation of the expected behavior with a compliant PDFFrom a users perspective, this is a bug - a violation of the expected behavior with a compliant PDF
Description
Looking at the code for PdfWriter.compress_identical_objects(remove_orphans=False)
at
Lines 1612 to 1685 in 1c9eacd
def compress_identical_objects( | |
self, | |
remove_identicals: bool = True, | |
remove_orphans: bool = True, | |
) -> None: | |
""" | |
Parse the PDF file and merge objects that have the same hash. | |
This will make objects common to multiple pages. | |
Recommended to be used just before writing output. | |
Args: | |
remove_identicals: Remove identical objects. | |
remove_orphans: Remove unreferenced objects. | |
""" | |
def replace_in_obj( | |
obj: PdfObject, crossref: Dict[IndirectObject, IndirectObject] | |
) -> None: | |
if isinstance(obj, DictionaryObject): | |
key_val = obj.items() | |
elif isinstance(obj, ArrayObject): | |
key_val = enumerate(obj) # type: ignore | |
else: | |
return | |
assert isinstance(obj, (DictionaryObject, ArrayObject)) | |
for k, v in key_val: | |
if isinstance(v, IndirectObject): | |
orphans[v.idnum - 1] = False | |
if v in crossref: | |
obj[k] = crossref[v] | |
else: | |
"""the filtering on DictionaryObject and ArrayObject only | |
will be performed within replace_in_obj""" | |
replace_in_obj(v, crossref) | |
# _idnum_hash :dict[hash]=(1st_ind_obj,[other_indir_objs,...]) | |
self._idnum_hash = {} | |
orphans = [True] * len(self._objects) | |
# look for similar objects | |
for idx, obj in enumerate(self._objects): | |
if is_null_or_none(obj): | |
continue | |
assert obj is not None, "mypy" # mypy: TypeGuard of `is_null_or_none` does not help here. | |
assert isinstance(obj.indirect_reference, IndirectObject) | |
h = obj.hash_value() | |
if remove_identicals and h in self._idnum_hash: | |
self._idnum_hash[h][1].append(obj.indirect_reference) | |
self._objects[idx] = None | |
else: | |
self._idnum_hash[h] = (obj.indirect_reference, []) | |
# generate the dict converting others to 1st | |
cnv = {v[0]: v[1] for v in self._idnum_hash.values() if len(v[1]) > 0} | |
cnv_rev: Dict[IndirectObject, IndirectObject] = {} | |
for k, v in cnv.items(): | |
cnv_rev.update(zip(v, (k,) * len(v))) | |
# replace reference to merged objects | |
for obj in self._objects: | |
if isinstance(obj, (DictionaryObject, ArrayObject)): | |
replace_in_obj(obj, cnv_rev) | |
# remove orphans (if applicable) | |
orphans[self.root_object.indirect_reference.idnum - 1] = False # type: ignore | |
orphans[self._info.indirect_reference.idnum - 1] = False # type: ignore | |
try: | |
orphans[self._ID.indirect_reference.idnum - 1] = False # type: ignore | |
except AttributeError: | |
pass | |
for i in compress(range(len(self._objects)), orphans): | |
self._objects[i] = None |
remove_orphans
is never evaluated and always assumed to be True
.Metadata
Metadata
Assignees
Labels
PdfWriterThe PdfWriter component is affectedThe PdfWriter component is affectedis-bugFrom a users perspective, this is a bug - a violation of the expected behavior with a compliant PDFFrom a users perspective, this is a bug - a violation of the expected behavior with a compliant PDF