Skip to content

PdfWriter.compress_identical_objects ignores remove_orphans #3306

@stefan6419846

Description

@stefan6419846

Looking at the code for PdfWriter.compress_identical_objects(remove_orphans=False) at

pypdf/pypdf/_writer.py

Lines 1612 to 1685 in 1c9eacd

def compress_identical_objects(
self,
remove_identicals: bool = True,
remove_orphans: bool = True,
) -> None:
"""
Parse the PDF file and merge objects that have the same hash.
This will make objects common to multiple pages.
Recommended to be used just before writing output.
Args:
remove_identicals: Remove identical objects.
remove_orphans: Remove unreferenced objects.
"""
def replace_in_obj(
obj: PdfObject, crossref: Dict[IndirectObject, IndirectObject]
) -> None:
if isinstance(obj, DictionaryObject):
key_val = obj.items()
elif isinstance(obj, ArrayObject):
key_val = enumerate(obj) # type: ignore
else:
return
assert isinstance(obj, (DictionaryObject, ArrayObject))
for k, v in key_val:
if isinstance(v, IndirectObject):
orphans[v.idnum - 1] = False
if v in crossref:
obj[k] = crossref[v]
else:
"""the filtering on DictionaryObject and ArrayObject only
will be performed within replace_in_obj"""
replace_in_obj(v, crossref)
# _idnum_hash :dict[hash]=(1st_ind_obj,[other_indir_objs,...])
self._idnum_hash = {}
orphans = [True] * len(self._objects)
# look for similar objects
for idx, obj in enumerate(self._objects):
if is_null_or_none(obj):
continue
assert obj is not None, "mypy" # mypy: TypeGuard of `is_null_or_none` does not help here.
assert isinstance(obj.indirect_reference, IndirectObject)
h = obj.hash_value()
if remove_identicals and h in self._idnum_hash:
self._idnum_hash[h][1].append(obj.indirect_reference)
self._objects[idx] = None
else:
self._idnum_hash[h] = (obj.indirect_reference, [])
# generate the dict converting others to 1st
cnv = {v[0]: v[1] for v in self._idnum_hash.values() if len(v[1]) > 0}
cnv_rev: Dict[IndirectObject, IndirectObject] = {}
for k, v in cnv.items():
cnv_rev.update(zip(v, (k,) * len(v)))
# replace reference to merged objects
for obj in self._objects:
if isinstance(obj, (DictionaryObject, ArrayObject)):
replace_in_obj(obj, cnv_rev)
# remove orphans (if applicable)
orphans[self.root_object.indirect_reference.idnum - 1] = False # type: ignore
orphans[self._info.indirect_reference.idnum - 1] = False # type: ignore
try:
orphans[self._ID.indirect_reference.idnum - 1] = False # type: ignore
except AttributeError:
pass
for i in compress(range(len(self._objects)), orphans):
self._objects[i] = None
it seems like remove_orphans is never evaluated and always assumed to be True.

Metadata

Metadata

Assignees

No one assigned

    Labels

    PdfWriterThe PdfWriter component is affectedis-bugFrom a users perspective, this is a bug - a violation of the expected behavior with a compliant PDF

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions