From 4b7abb6b6a0b24795d7aeb469f3aeca0f85fbda2 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Mon, 11 Aug 2025 07:36:33 -0600 Subject: [PATCH 1/9] pp_pack.c: White-space, add braces only Change a few lines that affect the next commits that use a coding style not conforming to current expectations And also add some blank lines to serve as "paragraph" markers between code segments, and change the indentation of a few lines in preparation for blocks being added and removed in the next commits. --- pp_pack.c | 46 ++++++++++++++++++++++++++++------------------ 1 file changed, 28 insertions(+), 18 deletions(-) diff --git a/pp_pack.c b/pp_pack.c index 0b53611c7bb1..508d2a26b2e7 100644 --- a/pp_pack.c +++ b/pp_pack.c @@ -287,22 +287,25 @@ S_utf8_to_bytes(pTHX_ const char **s, const char *end, const char *buf, SSize_t if (UNLIKELY(needs_swap)) buf += buf_len; - for (;buf_len > 0; buf_len--) { + for (; buf_len > 0; buf_len--) { if (from >= end) return FALSE; val = utf8n_to_uvchr((U8 *) from, end-from, &retlen, flags); if (retlen == (STRLEN) -1) { from += UTF8_SAFE_SKIP(from, end); bad |= 1; } else from += retlen; - if (val >= 0x100) { - bad |= 2; - val = (U8) val; - } + + if (val >= 0x100) { + bad |= 2; + val = (U8) val; + } + if (UNLIKELY(needs_swap)) *(U8 *)--buf = (U8)val; else *(U8 *)buf++ = (U8)val; } + /* We have enough characters for the buffer. Did we have problems ? */ if (bad) { if (bad & 1) { @@ -315,13 +318,15 @@ S_utf8_to_bytes(pTHX_ const char **s, const char *end, const char *buf, SSize_t } if (from > end) from = end; } - if ((bad & 2)) - ck_warner(packWARN(datumtype & TYPE_IS_PACK ? - WARN_PACK : WARN_UNPACK), - "Character(s) in '%c' format wrapped in %s", - (int) TYPE_NO_MODIFIERS(datumtype), - datumtype & TYPE_IS_PACK ? "pack" : "unpack"); + + if ((bad & 2)) + ck_warner(packWARN(datumtype & TYPE_IS_PACK ? + WARN_PACK : WARN_UNPACK), + "Character(s) in '%c' format wrapped in %s", + (int) TYPE_NO_MODIFIERS(datumtype), + datumtype & TYPE_IS_PACK ? "pack" : "unpack"); } + *s = from; return TRUE; } @@ -1230,17 +1235,17 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, const char *s, const char *strbeg, const c case 'c': while (len-- > 0 && s < strend) { int aint; - if (utf8) - { + if (utf8) { STRLEN retlen; aint = utf8n_to_uvchr((U8 *) s, strend-s, &retlen, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY); if (retlen == (STRLEN) -1) croak("Malformed UTF-8 string in unpack"); s += retlen; - } - else - aint = *(U8 *)(s)++; + } + else { + aint = *(U8 *)(s)++; + } if (aint >= 128 && datumtype != 'C') /* fake up signed chars */ aint -= 256; if (!checksum) @@ -1310,11 +1315,16 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, const char *s, const char *strbeg, const c break; len = UTF8SKIP(result); if (!S_utf8_to_bytes(aTHX_ &ptr, strend, - (char *) &result[1], len-1, 'U')) break; + (char *) &result[1], len - 1, 'U')) + { + break; + } + auv = utf8n_to_uvchr(result, len, &retlen, UTF8_ALLOW_DEFAULT); s = ptr; - } else { + } + else { auv = utf8n_to_uvchr((U8*)s, strend - s, &retlen, UTF8_ALLOW_DEFAULT); if (retlen == (STRLEN) -1) From 9c9fde4a524f52eed254fb8f54269a6457387481 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Wed, 5 Mar 2025 15:36:46 -0700 Subject: [PATCH 2/9] pp_pack: NEXT_UNI_VAL: Use 'uv' form, not 'uvchr' This converts a call to utf8n_to_uvchr() to instead use utf8_to_uv_flags(). This is part of the process of replacing all the '_uvchr' functions with '_uv' functions, which are now preferred. See perlapi. There is a subtle change of behavior here when the string to convert is malformed. I believe this is a bug fix. The macro NEXT_UNI_VAL assumed the failure return of utf8n_to_uvchr() is 'retlen' being -1. However, that value is only ever returned if the flags passed to the function include UTF8_CHECK_ONLY. Inspection of the code showed that this macro is never called with that flag; so this never currently fails. I looked at Perl v5.8.9, and the situation was the same. I doubt it has changed until now. This commit changes to use utf8_to_uv_flags(), and its failure return is properly checked for. The upshot is that this could now fail; whereas it couldn't before, even though the clear intent of the code has been to fail upon error. --- pp_pack.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/pp_pack.c b/pp_pack.c index 508d2a26b2e7..2e80d8057e0f 100644 --- a/pp_pack.c +++ b/pp_pack.c @@ -413,16 +413,16 @@ STMT_START { \ } STMT_END /* Only to be used inside a loop (see the break) */ -#define NEXT_UNI_VAL(val, cur, str, end, utf8_flags) \ -STMT_START { \ - STRLEN retlen; \ - if (str >= end) break; \ - val = utf8n_to_uvchr((U8 *) str, end-str, &retlen, utf8_flags); \ - if (retlen == (STRLEN) -1) { \ - *cur = '\0'; \ - croak("Malformed UTF-8 string in pack"); \ - } \ - str += retlen; \ +#define NEXT_UNI_VAL(val, cur, str, end, utf8_flags) \ +STMT_START { \ + STRLEN retlen; \ + if (str >= end) break; \ + if (! utf8_to_uv_flags((U8 *) str, (U8 *) end, &val, &retlen, \ + utf8_flags)) { \ + *cur = '\0'; \ + croak("Malformed UTF-8 string in pack"); \ + } \ + str += retlen; \ } STMT_END static const char *_action( const tempsym_t* symptr ) From cdeb0c8ab14a85b8b7c9df5e9400b4dc6bb59e5a Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Sun, 10 Aug 2025 10:46:19 -0600 Subject: [PATCH 3/9] pp_pack: utf8_to_byte: Use 'uv' form, not 'uvchr' Like the previous commit, this converts a call to utf8n_to_uvchr() to instead use utf8_to_uv_flags(). And like that commit, the previous code thought it was checking for failure and croaking when it occurred, but was checking the wrong way so that it never could fail. And now it does a proper check and can fail. --- pp_pack.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pp_pack.c b/pp_pack.c index 2e80d8057e0f..6243424434af 100644 --- a/pp_pack.c +++ b/pp_pack.c @@ -253,12 +253,14 @@ utf8_to_byte(pTHX_ const char **s, const char *end, I32 datumtype) if (*s >= end) { goto croak; } - val = utf8n_to_uvchr((U8 *) *s, end-*s, &retlen, - ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY); - if (retlen == (STRLEN) -1) + if (! utf8_to_uv_flags((U8 *) *s, (U8 *) end, &val, &retlen, + ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY)) + { croak: croak("Malformed UTF-8 string in '%c' format in unpack", (int) TYPE_NO_MODIFIERS(datumtype)); + } + if (val >= 0x100) { ck_warner(packWARN(WARN_UNPACK), "Character in '%c' format wrapped in unpack", From f18e772dccb58e24d29273d58be8b872ef274c45 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Mon, 11 Aug 2025 07:50:23 -0600 Subject: [PATCH 4/9] pp_pack: unpack: Use 'uv' form, not 'uvchr' The new form croaks under failure; the previous didn't. But to get here the previous paragraph of code had to succeed, which means this should too. The flag in the changed call is the default for utf8_to_uv_or_die(), so it can be removed. --- pp_pack.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pp_pack.c b/pp_pack.c index 6243424434af..b37dc2b2b280 100644 --- a/pp_pack.c +++ b/pp_pack.c @@ -1322,8 +1322,7 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, const char *s, const char *strbeg, const c break; } - auv = utf8n_to_uvchr(result, len, &retlen, - UTF8_ALLOW_DEFAULT); + auv = utf8_to_uv_or_die(result, result + len, &retlen); s = ptr; } else { From b4ff6083661d2574fb305632e2acd6300cb649e8 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Mon, 11 Aug 2025 16:31:44 -0600 Subject: [PATCH 5/9] pp_pack: unpack: Use 'uv' form, not 'uvchr' This converts a call to utf8n_to_uvchr(), using instead utf8_to_uv(). The flag parameter UTF8_ALLOW_DEFAULT before this commit is the default for utf8_to_uv(), so it can be omitted. Like previous commits, the previous code thought it was checking for failure and croaking, but was checking the wrong way so that it never could fail. In the new commit, I had to remove that checking, because it caused test suite failures. --- pp_pack.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pp_pack.c b/pp_pack.c index b37dc2b2b280..6619b2ffb63a 100644 --- a/pp_pack.c +++ b/pp_pack.c @@ -1326,10 +1326,7 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, const char *s, const char *strbeg, const c s = ptr; } else { - auv = utf8n_to_uvchr((U8*)s, strend - s, &retlen, - UTF8_ALLOW_DEFAULT); - if (retlen == (STRLEN) -1) - croak("Malformed UTF-8 string in unpack"); + (void) utf8_to_uv((U8 *) s, (U8 *) strend, &auv, &retlen); s += retlen; } if (!checksum) From 9d00e3710091d0335577011b9dc557838d01c39b Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Mon, 11 Aug 2025 16:31:23 -0600 Subject: [PATCH 6/9] pp_pack: unpack: Use 'uv' form, not 'uvchr' Like previous commits, this converts a call to utf8n_to_uvchr() to use instead utf8_to_uv_flags(). And like previous commits, the previous code thought it was checking for failure and croaking, but was checking the wrong way so that it never could fail. With the new commit, it can fail. Also, the previous commit stored the result in an 'int', whose range can be significantly less than the code point returned. The value was truncated without notice. Now, I croak if the value got truncated. I wonder what I should do, as this is for unpacking a 'c' format, which is supposed to be a signed 8-bit value. --- pp_pack.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/pp_pack.c b/pp_pack.c index 6619b2ffb63a..f6dde13d8ed1 100644 --- a/pp_pack.c +++ b/pp_pack.c @@ -1239,10 +1239,21 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, const char *s, const char *strbeg, const c int aint; if (utf8) { STRLEN retlen; - aint = utf8n_to_uvchr((U8 *) s, strend-s, &retlen, - ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY); - if (retlen == (STRLEN) -1) + UV auv; + if (! utf8_to_uv_flags((U8 *) s, (U8 *) strend, + &auv, &retlen, + (ckWARN(WARN_UTF8)) + ? 0 + : UTF8_ALLOW_ANY)) + { + croak("Malformed UTF-8 string in unpack"); + } + + aint = auv; + if ( (UV) aint != auv) { croak("Malformed UTF-8 string in unpack"); + } + s += retlen; } else { From 55c9e56b97b2261c35fab9eb901c176c5dd06d7d Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Mon, 11 Aug 2025 14:54:06 -0600 Subject: [PATCH 7/9] pp_pack: utf8_to_bytes: Use 'uv' form, not 'uvchr' Like previous commits, this converts a call to utf8n_to_uvchr() to instead use utf8_to_uv_flags(). Unlike those commits, the previous code did in fact properly check for failure. But upon failure, it semi-unsafely assumed the start byte was accurate as to the intended byte length of the input character. (It did check that the returned value did not go past the end of the buffer, by using UTF8_SAFE_SKIP() ). The new function call always returns the correct number of bytes, so just use that instead of trying to recompute it. The mechanism for deteriming failure no longer depends on the CHECK_ONLY flag, so that is removed. --- pp_pack.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pp_pack.c b/pp_pack.c index f6dde13d8ed1..a70e0937a270 100644 --- a/pp_pack.c +++ b/pp_pack.c @@ -282,8 +282,7 @@ S_utf8_to_bytes(pTHX_ const char **s, const char *end, const char *buf, SSize_t STRLEN retlen; const char *from = *s; int bad = 0; - const U32 flags = ckWARN(WARN_UTF8) ? - UTF8_CHECK_ONLY : (UTF8_CHECK_ONLY | UTF8_ALLOW_ANY); + const U32 flags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY; const bool needs_swap = NEEDS_SWAP(datumtype); if (UNLIKELY(needs_swap)) @@ -291,17 +290,18 @@ S_utf8_to_bytes(pTHX_ const char **s, const char *end, const char *buf, SSize_t for (; buf_len > 0; buf_len--) { if (from >= end) return FALSE; - val = utf8n_to_uvchr((U8 *) from, end-from, &retlen, flags); - if (retlen == (STRLEN) -1) { - from += UTF8_SAFE_SKIP(from, end); + if (! utf8_to_uv_flags((U8 *) from, (U8 *) end, &val, &retlen, flags)) + { bad |= 1; - } else from += retlen; + } if (val >= 0x100) { bad |= 2; val = (U8) val; } + from += retlen; + if (UNLIKELY(needs_swap)) *(U8 *)--buf = (U8)val; else From 90bf29ff8c0f74de78ea55fff859fe89ab934fd6 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Mon, 11 Aug 2025 15:05:03 -0600 Subject: [PATCH 8/9] pp_pack.c: S_utf8_to_bytes: Rmv loop Prior to this commit, the code went through the buffer with warnings turned off, but saving the fact if there were problems. If so, it ran through the buffer again, with warnings turned on to display those problems. I'm unsure of why it didn't output any warnings until it got to the end. But utf8_to_uv_msgs() allows us to store up the messages and output them later, without having to reparse --- pp_pack.c | 51 ++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 36 insertions(+), 15 deletions(-) diff --git a/pp_pack.c b/pp_pack.c index a70e0937a270..ce49773b7f91 100644 --- a/pp_pack.c +++ b/pp_pack.c @@ -288,20 +288,32 @@ S_utf8_to_bytes(pTHX_ const char **s, const char *end, const char *buf, SSize_t if (UNLIKELY(needs_swap)) buf += buf_len; + AV * msgs = NULL; for (; buf_len > 0; buf_len--) { if (from >= end) return FALSE; - if (! utf8_to_uv_flags((U8 *) from, (U8 *) end, &val, &retlen, flags)) - { - bad |= 1; - } + AV * this_msgs = NULL; + if (utf8_to_uv_msgs((U8 *) from, (U8 *) end, &val, &retlen, flags, + NULL, &this_msgs)) + { if (val >= 0x100) { bad |= 2; val = (U8) val; } + } from += retlen; + /* Add any messages from this conversion to the list for later output + * */ + if (this_msgs) { + while (av_count(this_msgs) > 0) { + av_push(msgs, av_shift(this_msgs)); + } + + Safefree(this_msgs); + } + if (UNLIKELY(needs_swap)) *(U8 *)--buf = (U8)val; else @@ -309,25 +321,34 @@ S_utf8_to_bytes(pTHX_ const char **s, const char *end, const char *buf, SSize_t } /* We have enough characters for the buffer. Did we have problems ? */ - if (bad) { - if (bad & 1) { - /* Rewalk the string fragment while warning */ - const char *ptr; - const U32 flags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY; - for (ptr = *s; ptr < from; ptr += UTF8SKIP(ptr)) { - if (ptr >= end) break; - utf8n_to_uvchr((U8 *) ptr, end-ptr, &retlen, flags); - } - if (from > end) from = end; + if (msgs) { + while (av_count(msgs) > 0) { + HV * msg_hash = (HV *) av_shift(msgs); + SV ** packed_categories_p = hv_fetchs(msg_hash, "warn_categories", 0); + if (packed_categories_p == NULL) { + continue; + } + + UV packed_categories = SvUV(*packed_categories_p); + if (packed_categories == 0) { + continue; + } + + SV ** warn_text_p = hv_fetchs(msg_hash, "text", 0); + if (warn_text_p) { + warner(packed_categories, "%s", SvPV_nolen(*warn_text_p)); + } } + Safefree(msgs); + } + if ((bad & 2)) ck_warner(packWARN(datumtype & TYPE_IS_PACK ? WARN_PACK : WARN_UNPACK), "Character(s) in '%c' format wrapped in %s", (int) TYPE_NO_MODIFIERS(datumtype), datumtype & TYPE_IS_PACK ? "pack" : "unpack"); - } *s = from; return TRUE; From 6556ff3788d40866966dd4774c785f6ef0afe085 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Mon, 11 Aug 2025 15:15:56 -0600 Subject: [PATCH 9/9] pp_pack.c: Convert variable to bool The reason for it to be more than this was removed in the previous commit --- pp_pack.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pp_pack.c b/pp_pack.c index ce49773b7f91..465d302ff982 100644 --- a/pp_pack.c +++ b/pp_pack.c @@ -281,7 +281,7 @@ S_utf8_to_bytes(pTHX_ const char **s, const char *end, const char *buf, SSize_t UV val; STRLEN retlen; const char *from = *s; - int bad = 0; + bool bad = false; const U32 flags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY; const bool needs_swap = NEEDS_SWAP(datumtype); @@ -297,7 +297,7 @@ S_utf8_to_bytes(pTHX_ const char **s, const char *end, const char *buf, SSize_t NULL, &this_msgs)) { if (val >= 0x100) { - bad |= 2; + bad = true; val = (U8) val; } } @@ -343,12 +343,13 @@ S_utf8_to_bytes(pTHX_ const char **s, const char *end, const char *buf, SSize_t Safefree(msgs); } - if ((bad & 2)) + if (bad) { ck_warner(packWARN(datumtype & TYPE_IS_PACK ? WARN_PACK : WARN_UNPACK), "Character(s) in '%c' format wrapped in %s", (int) TYPE_NO_MODIFIERS(datumtype), datumtype & TYPE_IS_PACK ? "pack" : "unpack"); + } *s = from; return TRUE;