diff --git a/Doc/library/marshal.rst b/Doc/library/marshal.rst
index d65afc20041133..18fff2a29b482b 100644
--- a/Doc/library/marshal.rst
+++ b/Doc/library/marshal.rst
@@ -54,7 +54,7 @@ bytes-like objects.
 The module defines these functions:
 
 
-.. function:: dump(value, file[, version])
+.. function:: dump(value, file[, version[, stable]])
 
    Write the value on the open file.  The value must be a supported type.  The
    file must be a writeable :term:`binary file`.
@@ -66,6 +66,13 @@ The module defines these functions:
    The *version* argument indicates the data format that ``dump`` should use
    (see below).
 
+   If *stable* is true, the generated data is made as stable as possible:
+   the bytes written are identical for ``value1`` and ``value2`` when
+   ``value1 is value2``, but not necessarily when ``value1 == value2``.
+
+   .. versionchanged:: 3.8
+      Added the *stable* parameter.
+
 
 .. function:: load(file)
 
@@ -80,7 +87,7 @@ The module defines these functions:
    :func:`load` will substitute ``None`` for the unmarshallable type.
 
 
-.. function:: dumps(value[, version])
+.. function:: dumps(value[, version[, stable]])
 
    Return the bytes object that would be written to a file by ``dump(value, file)``.  The
    value must be a supported type.  Raise a :exc:`ValueError` exception if value
@@ -89,6 +96,13 @@ The module defines these functions:
    The *version* argument indicates the data format that ``dumps`` should use
    (see below).
 
+   If *stable* is true, the generated data is made as stable as possible:
+   ``dumps(value1, 4, True) == dumps(value2, 4, True)`` is guaranteed
+   for ``value1 is value2``, but not for ``value1 == value2``.
+
+   .. versionchanged:: 3.8
+      Added the *stable* parameter.
+
 
 .. function:: loads(bytes)
 
diff --git a/Lib/test/test_marshal.py b/Lib/test/test_marshal.py
index a3bd350c77b95b..0c17f282caea7c 100644
--- a/Lib/test/test_marshal.py
+++ b/Lib/test/test_marshal.py
@@ -324,6 +324,25 @@ def test_eof(self):
         for i in range(len(data)):
             self.assertRaises(EOFError, marshal.loads, data[0: i])
 
+    def test_stable_refs(self):
+        """In stable mode FLAG_REF must not depend on the refcount."""
+        x = 0x42
+        y = (x,)
+        z = [y, y]
+        dummy = x  # refcnt of x must be >1
+
+        # x is used once, FLAG_REF must not be set.
+        data = marshal.dumps(x, 4, True)
+        self.assertEqual(b"i\x42\x00\x00\x00", data)
+
+        data = marshal.dumps(z, 4, True)
+        # y is used twice, but x is used once because y is reused.
+        self.assertEqual(b"[\x02\x00\x00\x00" +  # list(size=2)
+                         b"\xa9\x01" +  # small tuple(size=1) | FLAG_REF
+                         b"i\x42\x00\x00\x00" +  # int(0x42)
+                         b"r\x00\x00\x00\x00",  # ref(0)
+                         data)
+
 LARGE_SIZE = 2**31
 pointer_size = 8 if sys.maxsize > 0xFFFFFFFF else 4
 
diff --git a/Python/clinic/marshal.c.h b/Python/clinic/marshal.c.h
index 1ae9332fd3f772..7508fac87813ed 100644
--- a/Python/clinic/marshal.c.h
+++ b/Python/clinic/marshal.c.h
@@ -3,7 +3,7 @@ preserve
 [clinic start generated code]*/
 
 PyDoc_STRVAR(marshal_dump__doc__,
-"dump($module, value, file, version=version, /)\n"
+"dump($module, value, file, version=version, stable=False, /)\n"
 "--\n"
 "\n"
 "Write the value on the open file.\n"
@@ -14,6 +14,8 @@ PyDoc_STRVAR(marshal_dump__doc__,
 "    Must be a writeable binary file.\n"
 "  version\n"
 "    Indicates the data format that dump should use.\n"
+"  stable\n"
+"    Generate stable output as possible.\n"
 "\n"
 "If the value has (or contains an object that has) an unsupported type, a\n"
 "ValueError exception is raised - but garbage data will also be written\n"
@@ -24,7 +26,7 @@ PyDoc_STRVAR(marshal_dump__doc__,
 
 static PyObject *
 marshal_dump_impl(PyObject *module, PyObject *value, PyObject *file,
-                  int version);
+                  int version, int stable);
 
 static PyObject *
 marshal_dump(PyObject *module, PyObject *const *args, Py_ssize_t nargs)
@@ -33,12 +35,13 @@ marshal_dump(PyObject *module, PyObject *const *args, Py_ssize_t nargs)
     PyObject *value;
     PyObject *file;
     int version = Py_MARSHAL_VERSION;
+    int stable = 0;
 
-    if (!_PyArg_ParseStack(args, nargs, "OO|i:dump",
-        &value, &file, &version)) {
+    if (!_PyArg_ParseStack(args, nargs, "OO|ip:dump",
+        &value, &file, &version, &stable)) {
         goto exit;
     }
-    return_value = marshal_dump_impl(module, value, file, version);
+    return_value = marshal_dump_impl(module, value, file, version, stable);
 
 exit:
     return return_value;
@@ -64,7 +67,7 @@ PyDoc_STRVAR(marshal_load__doc__,
     {"load", (PyCFunction)marshal_load, METH_O, marshal_load__doc__},
 
 PyDoc_STRVAR(marshal_dumps__doc__,
-"dumps($module, value, version=version, /)\n"
+"dumps($module, value, version=version, stable=False, /)\n"
 "--\n"
 "\n"
 "Return the bytes object that would be written to a file by dump(value, file).\n"
@@ -73,6 +76,8 @@ PyDoc_STRVAR(marshal_dumps__doc__,
 "    Must be a supported type.\n"
 "  version\n"
 "    Indicates the data format that dumps should use.\n"
+"  stable\n"
+"    Generate stable output as possible.\n"
 "\n"
 "Raise a ValueError exception if value has (or contains an object that has) an\n"
 "unsupported type.");
@@ -81,7 +86,8 @@ PyDoc_STRVAR(marshal_dumps__doc__,
     {"dumps", (PyCFunction)marshal_dumps, METH_FASTCALL, marshal_dumps__doc__},
 
 static PyObject *
-marshal_dumps_impl(PyObject *module, PyObject *value, int version);
+marshal_dumps_impl(PyObject *module, PyObject *value, int version,
+                   int stable);
 
 static PyObject *
 marshal_dumps(PyObject *module, PyObject *const *args, Py_ssize_t nargs)
@@ -89,12 +95,13 @@ marshal_dumps(PyObject *module, PyObject *const *args, Py_ssize_t nargs)
     PyObject *return_value = NULL;
     PyObject *value;
     int version = Py_MARSHAL_VERSION;
+    int stable = 0;
 
-    if (!_PyArg_ParseStack(args, nargs, "O|i:dumps",
-        &value, &version)) {
+    if (!_PyArg_ParseStack(args, nargs, "O|ip:dumps",
+        &value, &version, &stable)) {
         goto exit;
     }
-    return_value = marshal_dumps_impl(module, value, version);
+    return_value = marshal_dumps_impl(module, value, version, stable);
 
 exit:
     return return_value;
@@ -134,4 +141,4 @@ marshal_loads(PyObject *module, PyObject *arg)
 
     return return_value;
 }
-/*[clinic end generated code: output=584eb2222d86fdc3 input=a9049054013a1b77]*/
+/*[clinic end generated code: output=eece7b65d6cb500a input=a9049054013a1b77]*/
diff --git a/Python/marshal.c b/Python/marshal.c
index 6d06266c6a8e2e..ea72fde6ec00ae 100644
--- a/Python/marshal.c
+++ b/Python/marshal.c
@@ -86,7 +86,9 @@ typedef struct {
     char *end;
     char *buf;
     _Py_hashtable_t *hashtable;
+    int last_index;
     int version;
+    int stable;
 } WFILE;
 
 #define w_byte(c, p) do {                               \
@@ -276,37 +278,67 @@ w_ref(PyObject *v, char *flag, WFILE *p)
         return 0; /* not writing object references */
 
     /* if it has only one reference, it definitely isn't shared */
-    if (Py_REFCNT(v) == 1)
+    if (Py_REFCNT(v) == 1) {
         return 0;
+    }
 
     entry = _Py_HASHTABLE_GET_ENTRY(p->hashtable, v);
-    if (entry != NULL) {
-        /* write the reference index to the stream */
+
+    if (p->stable) {
+        if (entry == NULL) {
+            return 0;
+        }
+
         _Py_HASHTABLE_ENTRY_READ_DATA(p->hashtable, entry, w);
-        /* we don't store "long" indices in the dict */
-        assert(0 <= w && w <= 0x7fffffff);
-        w_byte(TYPE_REF, p);
-        w_long(w, p);
-        return 1;
-    } else {
-        size_t s = p->hashtable->entries;
-        /* we don't support long indices */
-        if (s >= 0x7fffffff) {
-            PyErr_SetString(PyExc_ValueError, "too many objects");
-            goto err;
+        // w >= 0: index written by previous w_ref()
+        // w < 0 : marker set by w_count_refs() (-1: met once, -2: shared)
+        if (w == -1) {
+            // This object is used only once.
+            return 0;
         }
-        w = (int)s;
-        Py_INCREF(v);
-        if (_Py_HASHTABLE_SET(p->hashtable, v, w) < 0) {
-            Py_DECREF(v);
-            goto err;
+
+        if (w >= 0) {
+            /* we don't store "long" indices in the dict */
+            assert(0 <= w && w <= 0x7fffffff);
+            w_byte(TYPE_REF, p);
+            w_long(w, p);
+            return 1;
+        } else {
+            w = p->last_index++;
+            _Py_HASHTABLE_ENTRY_WRITE_DATA(p->hashtable, entry, w);
+            *flag |= FLAG_REF;
+            return 0;
         }
-        *flag |= FLAG_REF;
-        return 0;
     }
+    else {
+        if (entry != NULL) {
+            /* write the reference index to the stream */
+            _Py_HASHTABLE_ENTRY_READ_DATA(p->hashtable, entry, w);
+            /* we don't store "long" indices in the dict */
+            assert(0 <= w && w <= 0x7fffffff);
+            w_byte(TYPE_REF, p);
+            w_long(w, p);
+            return 1;
+        } else {
+            size_t s = p->hashtable->entries;
+            /* we don't support long indices */
+            if (s >= 0x7fffffff) {
+                PyErr_SetString(PyExc_ValueError, "too many objects");
+                goto err;
+            }
+            w = (int)s;
+            Py_INCREF(v);
+            if (_Py_HASHTABLE_SET(p->hashtable, v, w) < 0) {
+                Py_DECREF(v);
+                goto err;
+            }
+            *flag |= FLAG_REF;
+            return 0;
+        }
 err:
-    p->error = WFERR_UNMARSHALLABLE;
-    return 1;
+        p->error = WFERR_UNMARSHALLABLE;
+        return 1;
+    }
 }
 
 static void
@@ -584,17 +616,156 @@ w_complex_object(PyObject *v, char flag, WFILE *p)
 }
 
+/* Stable-mode pre-pass: walk the object graph rooted at v and record in
+   p->hashtable every object whose refcount is > 1.  The stored value is a
+   negative marker: -1 if the traversal met the object once, -2 if it met
+   it more than once.  w_ref() later replaces a -2 marker by the reference
+   index and never emits FLAG_REF for an object still marked -1.
+
+   Return 0 on success.  On failure set p->error, make sure an exception
+   is set, and return 1. */
 static int
-w_init_refs(WFILE *wf, int version)
+w_count_refs(PyObject *v, WFILE *p)
 {
-    if (version >= 3) {
-        wf->hashtable = _Py_hashtable_new(sizeof(PyObject *), sizeof(int),
-                                          _Py_hashtable_hash_ptr,
-                                          _Py_hashtable_compare_direct);
-        if (wf->hashtable == NULL) {
-            PyErr_NoMemory();
-            return -1;
-        }
-    }
+    int failed = 0;
+
+    if (p->depth > MAX_MARSHAL_STACK_DEPTH) {
+        PyErr_SetString(PyExc_ValueError,
+                        "object too deeply nested to marshal");
+        goto err;
+    }
+
+    if (v == NULL ||
+        v == Py_None ||
+        v == PyExc_StopIteration ||
+        v == Py_Ellipsis ||
+        v == Py_False ||
+        v == Py_True) {
+        return 0;
+    }
+
+    /* if it has only one reference, it definitely isn't shared */
+    if (Py_REFCNT(v) > 1) {
+        _Py_hashtable_entry_t *entry = _Py_HASHTABLE_GET_ENTRY(p->hashtable, v);
+        if (entry != NULL) {
+            /* Already visited, so its children are counted too.  w_ref()
+               only tells -1 ("met once") from smaller values ("shared"),
+               so saturate at -2: an unbounded "w--" could overflow int. */
+            int w;
+            _Py_HASHTABLE_ENTRY_READ_DATA(p->hashtable, entry, w);
+            assert(w < 0);
+            if (w == -1) {
+                w = -2;
+                _Py_HASHTABLE_ENTRY_WRITE_DATA(p->hashtable, entry, w);
+            }
+            return 0;
+        }
+        else {
+            size_t s = p->hashtable->entries;
+            /* we don't support long indices */
+            if (s >= 0x7fffffff) {
+                PyErr_SetString(PyExc_ValueError, "too many objects");
+                goto err;
+            }
+            int w = -1;
+            Py_INCREF(v);
+            if (_Py_HASHTABLE_SET(p->hashtable, v, w) < 0) {
+                Py_DECREF(v);
+                /* Presumably an allocation failure; without this the
+                   caller would return NULL with no exception set. */
+                PyErr_NoMemory();
+                goto err;
+            }
+        }
+    }
+
+    /* The traversal below must visit the same children as w_object(). */
+    p->depth++;
+
+    Py_ssize_t i, n;
+    if (PyTuple_CheckExact(v)) {
+        n = PyTuple_Size(v);
+        for (i = 0; i < n && !failed; i++) {
+            failed = w_count_refs(PyTuple_GET_ITEM(v, i), p);
+        }
+    }
+    else if (PyList_CheckExact(v)) {
+        n = PyList_GET_SIZE(v);
+        for (i = 0; i < n && !failed; i++) {
+            failed = w_count_refs(PyList_GET_ITEM(v, i), p);
+        }
+    }
+    else if (PyDict_CheckExact(v)) {
+        PyObject *key, *value;
+        i = 0;
+        while (!failed && PyDict_Next(v, &i, &key, &value)) {
+            failed = w_count_refs(key, p) || w_count_refs(value, p);
+        }
+    }
+    else if (PyAnySet_CheckExact(v)) {
+        PyObject *value, *it;
+
+        it = PyObject_GetIter(v);
+        if (it == NULL) {
+            p->depth--;
+            goto err;
+        }
+        while (!failed && (value = PyIter_Next(it)) != NULL) {
+            failed = w_count_refs(value, p);
+            Py_DECREF(value);
+        }
+        Py_DECREF(it);
+        if (!failed && PyErr_Occurred()) {
+            p->depth--;
+            goto err;
+        }
+    }
+    else if (PyCode_Check(v)) {
+        PyCodeObject *co = (PyCodeObject *)v;
+        failed = w_count_refs(co->co_code, p) ||
+                 w_count_refs(co->co_consts, p) ||
+                 w_count_refs(co->co_names, p) ||
+                 w_count_refs(co->co_varnames, p) ||
+                 w_count_refs(co->co_freevars, p) ||
+                 w_count_refs(co->co_cellvars, p) ||
+                 w_count_refs(co->co_filename, p) ||
+                 w_count_refs(co->co_name, p) ||
+                 w_count_refs(co->co_lnotab, p);
+    }
+
+    p->depth--;
+    /* A failed child has already set p->error and an exception. */
+    return failed;
+
+err:
+    p->error = WFERR_UNMARSHALLABLE;
+    return 1;
+}
+
+/* Create the reference table of wf for marshalling x (format version 3 and
+   later only) and, in stable mode, run the counting pre-pass over x.
+
+   Return 0 on success, nonzero with an exception set on failure.  The table
+   may already exist when a failure is reported, so callers release it with
+   w_clear_refs() on that path too. */
+static int
+w_init_refs(WFILE *wf, int version, PyObject *x)
+{
+    if (version < 3) {
+        return 0;
+    }
+
+    wf->hashtable = _Py_hashtable_new(sizeof(PyObject *), sizeof(int),
+                                      _Py_hashtable_hash_ptr,
+                                      _Py_hashtable_compare_direct);
+    if (wf->hashtable == NULL) {
+        PyErr_NoMemory();
+        return -1;
+    }
+    wf->last_index = 0;
+
+    if (wf->stable) {
+        return w_count_refs(x, wf);
+    }
     return 0;
 }
 
@@ -645,8 +816,11 @@ PyMarshal_WriteObjectToFile(PyObject *x, FILE *fp, int version)
     wf.end = wf.ptr + sizeof(buf);
     wf.error = WFERR_OK;
     wf.version = version;
-    if (w_init_refs(&wf, version))
-        return; /* caller mush check PyErr_Occurred() */
+    if (w_init_refs(&wf, version, x)) {
+        /* The table may hold references taken before the failure. */
+        w_clear_refs(&wf);
+        return; /* caller must check PyErr_Occurred() */
+    }
     w_object(x, &wf);
     w_clear_refs(&wf);
     w_flush(&wf);
@@ -1608,20 +1782,23 @@ PyMarshal_ReadObjectFromString(const char *str, Py_ssize_t len)
     return result;
 }
 
-PyObject *
-PyMarshal_WriteObjectToString(PyObject *x, int version)
+/* Shared implementation of PyMarshal_WriteObjectToString() and of
+   marshal.dump()/marshal.dumps(): marshal x using format `version`; a
+   nonzero `stable` selects stable mode. */
+static PyObject *
+marshal_to_string(PyObject *x, int version, int stable)
 {
-    WFILE wf;
+    WFILE wf = {.stable=stable, .version=version};
 
-    memset(&wf, 0, sizeof(wf));
     wf.str = PyBytes_FromStringAndSize((char *)NULL, 50);
     if (wf.str == NULL)
         return NULL;
     wf.ptr = wf.buf = PyBytes_AS_STRING((PyBytesObject *)wf.str);
     wf.end = wf.ptr + PyBytes_Size(wf.str);
     wf.error = WFERR_OK;
-    wf.version = version;
-    if (w_init_refs(&wf, version)) {
+    if (w_init_refs(&wf, version, x)) {
+        /* The table may hold references taken before the failure. */
+        w_clear_refs(&wf);
         Py_DECREF(wf.str);
         return NULL;
     }
@@ -1651,6 +1828,12 @@ PyMarshal_WriteObjectToString(PyObject *x, int version)
     return wf.str;
 }
 
+PyObject *
+PyMarshal_WriteObjectToString(PyObject *x, int version)
+{
+    return marshal_to_string(x, version, 0);
+}
+
 /* And an interface for Python programs... */
 /*[clinic input]
 marshal.dump
@@ -1661,6 +1844,8 @@ marshal.dump
         Must be a writeable binary file.
     version: int(c_default="Py_MARSHAL_VERSION") = version
         Indicates the data format that dump should use.
+    stable: bool = False
+        Generate stable output as possible.
     /
 
 Write the value on the open file.
@@ -1672,15 +1857,15 @@ to the file. The object will not be properly read back by load().
 
 static PyObject *
 marshal_dump_impl(PyObject *module, PyObject *value, PyObject *file,
-                  int version)
-/*[clinic end generated code: output=aaee62c7028a7cb2 input=6c7a3c23c6fef556]*/
+                  int version, int stable)
+/*[clinic end generated code: output=b472bdb1b466baa1 input=89780da6b9530e4b]*/
 {
     /* XXX Quick hack -- need to do this differently */
     PyObject *s;
     PyObject *res;
     _Py_IDENTIFIER(write);
 
-    s = PyMarshal_WriteObjectToString(value, version);
+    s = marshal_to_string(value, version, stable);
     if (s == NULL)
         return NULL;
     res = _PyObject_CallMethodIdObjArgs(file, &PyId_write, s, NULL);
@@ -1754,6 +1939,8 @@ marshal.dumps
         Must be a supported type.
     version: int(c_default="Py_MARSHAL_VERSION") = version
         Indicates the data format that dumps should use.
+    stable: bool = False
+        Generate stable output as possible.
     /
 
 Return the bytes object that would be written to a file by dump(value, file).
@@ -1763,10 +1950,11 @@ unsupported type.
 [clinic start generated code]*/
 
 static PyObject *
-marshal_dumps_impl(PyObject *module, PyObject *value, int version)
-/*[clinic end generated code: output=9c200f98d7256cad input=a2139ea8608e9b27]*/
+marshal_dumps_impl(PyObject *module, PyObject *value, int version,
+                   int stable)
+/*[clinic end generated code: output=87276039e6c75faf input=afce1546a470f153]*/
 {
-    return PyMarshal_WriteObjectToString(value, version);
+    return marshal_to_string(value, version, stable);
 }
 
 /*[clinic input]