From 4b7abb6b6a0b24795d7aeb469f3aeca0f85fbda2 Mon Sep 17 00:00:00 2001
From: Karl Williamson <khw@cpan.org>
Date: Mon, 11 Aug 2025 07:36:33 -0600
Subject: [PATCH 1/9] pp_pack.c: White-space, add braces only

Change a few lines that affect the next commits that use a coding style
not conforming to current expectations

And also add some blank lines to serve as "paragraph" markers between code
segments, and change the indentation of a few lines in preparation for
blocks being added and removed in the next commits.
---
 pp_pack.c | 46 ++++++++++++++++++++++++++++------------------
 1 file changed, 28 insertions(+), 18 deletions(-)

diff --git a/pp_pack.c b/pp_pack.c
index 0b53611c7bb1..508d2a26b2e7 100644
--- a/pp_pack.c
+++ b/pp_pack.c
@@ -287,22 +287,25 @@ S_utf8_to_bytes(pTHX_ const char **s, const char *end, const char *buf, SSize_t
     if (UNLIKELY(needs_swap))
         buf += buf_len;
 
-    for (;buf_len > 0; buf_len--) {
+    for (; buf_len > 0; buf_len--) {
         if (from >= end) return FALSE;
         val = utf8n_to_uvchr((U8 *) from, end-from, &retlen, flags);
         if (retlen == (STRLEN) -1) {
             from += UTF8_SAFE_SKIP(from, end);
             bad |= 1;
         } else from += retlen;
-        if (val >= 0x100) {
-            bad |= 2;
-            val = (U8) val;
-        }
+
+            if (val >= 0x100) {
+                bad |= 2;
+                val = (U8) val;
+            }
+
         if (UNLIKELY(needs_swap))
             *(U8 *)--buf = (U8)val;
         else
             *(U8 *)buf++ = (U8)val;
     }
+
     /* We have enough characters for the buffer. Did we have problems ? */
     if (bad) {
         if (bad & 1) {
@@ -315,13 +318,15 @@ S_utf8_to_bytes(pTHX_ const char **s, const char *end, const char *buf, SSize_t
             }
             if (from > end) from = end;
         }
-        if ((bad & 2))
-            ck_warner(packWARN(datumtype & TYPE_IS_PACK ?
-                               WARN_PACK : WARN_UNPACK),
-                      "Character(s) in '%c' format wrapped in %s",
-                      (int) TYPE_NO_MODIFIERS(datumtype),
-                      datumtype & TYPE_IS_PACK ? "pack" : "unpack");
+
+    if ((bad & 2))
+        ck_warner(packWARN(datumtype & TYPE_IS_PACK ?
+                            WARN_PACK : WARN_UNPACK),
+                  "Character(s) in '%c' format wrapped in %s",
+                  (int) TYPE_NO_MODIFIERS(datumtype),
+                  datumtype & TYPE_IS_PACK ? "pack" : "unpack");
     }
+
     *s = from;
     return TRUE;
 }
@@ -1230,17 +1235,17 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, const char *s, const char *strbeg, const c
         case 'c':
             while (len-- > 0 && s < strend) {
                 int aint;
-                if (utf8)
-                  {
+                if (utf8) {
                     STRLEN retlen;
                     aint = utf8n_to_uvchr((U8 *) s, strend-s, &retlen,
                                  ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
                     if (retlen == (STRLEN) -1)
                         croak("Malformed UTF-8 string in unpack");
                     s += retlen;
-                  }
-                else
-                  aint = *(U8 *)(s)++;
+                }
+                else {
+                    aint = *(U8 *)(s)++;
+                }
                 if (aint >= 128 && datumtype != 'C')	/* fake up signed chars */
                     aint -= 256;
                 if (!checksum)
@@ -1310,11 +1315,16 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, const char *s, const char *strbeg, const c
                         break;
                     len = UTF8SKIP(result);
                     if (!S_utf8_to_bytes(aTHX_ &ptr, strend,
-                                      (char *) &result[1], len-1, 'U')) break;
+                                         (char *) &result[1], len - 1, 'U'))
+                    {
+                        break;
+                    }
+
                     auv = utf8n_to_uvchr(result, len, &retlen,
                                          UTF8_ALLOW_DEFAULT);
                     s = ptr;
-                } else {
+                }
+                else {
                     auv = utf8n_to_uvchr((U8*)s, strend - s, &retlen,
                                          UTF8_ALLOW_DEFAULT);
                     if (retlen == (STRLEN) -1)

From 9c9fde4a524f52eed254fb8f54269a6457387481 Mon Sep 17 00:00:00 2001
From: Karl Williamson <khw@cpan.org>
Date: Wed, 5 Mar 2025 15:36:46 -0700
Subject: [PATCH 2/9] pp_pack: NEXT_UNI_VAL: Use 'uv' form, not 'uvchr'

This converts a call to utf8n_to_uvchr() to instead use
utf8_to_uv_flags().

This is part of the process of replacing all the '_uvchr' functions with
'_uv' functions, which are now preferred.  See perlapi.

There is a subtle change of behavior here when the string to convert is
malformed.  I believe this is a bug fix.  The macro NEXT_UNI_VAL assumed
the failure return of utf8n_to_uvchr() is 'retlen' being -1.  However,
that value is only ever returned if the flags passed to the function
include UTF8_CHECK_ONLY.  Inspection of the code showed that this macro
is never called with that flag; so this never currently fails.

I looked at Perl v5.8.9, and the situation was the same.  I doubt it has
changed until now.

This commit changes to use utf8_to_uv_flags(), and its failure return is
properly checked for.  The upshot is that this could now fail; whereas
it couldn't before, even though the clear intent of the code has been to
fail upon error.
---
 pp_pack.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/pp_pack.c b/pp_pack.c
index 508d2a26b2e7..2e80d8057e0f 100644
--- a/pp_pack.c
+++ b/pp_pack.c
@@ -413,16 +413,16 @@ STMT_START {					\
 } STMT_END
 
 /* Only to be used inside a loop (see the break) */
-#define NEXT_UNI_VAL(val, cur, str, end, utf8_flags)		\
-STMT_START {							\
-    STRLEN retlen;						\
-    if (str >= end) break;					\
-    val = utf8n_to_uvchr((U8 *) str, end-str, &retlen, utf8_flags);	\
-    if (retlen == (STRLEN) -1) {			        \
-        *cur = '\0';						\
-        croak("Malformed UTF-8 string in pack");	        \
-    }								\
-    str += retlen;						\
+#define NEXT_UNI_VAL(val, cur, str, end, utf8_flags)		            \
+STMT_START {							            \
+    STRLEN retlen;						            \
+    if (str >= end) break;					            \
+    if (! utf8_to_uv_flags((U8 *) str, (U8 *) end, &val, &retlen,           \
+                            utf8_flags)) {                                  \
+        *cur = '\0';						            \
+        croak("Malformed UTF-8 string in pack");	                    \
+    }								            \
+    str += retlen;						            \
 } STMT_END
 
 static const char *_action( const tempsym_t* symptr )

From cdeb0c8ab14a85b8b7c9df5e9400b4dc6bb59e5a Mon Sep 17 00:00:00 2001
From: Karl Williamson <khw@cpan.org>
Date: Sun, 10 Aug 2025 10:46:19 -0600
Subject: [PATCH 3/9] pp_pack: utf8_to_byte: Use 'uv' form, not 'uvchr'

Like the previous commit, this converts a call to utf8n_to_uvchr() to
instead use utf8_to_uv_flags().

And like that commit, the previous code thought it was checking for
failure and croaking when it occurred, but was checking the wrong way so
that it never could fail.  And now it does a proper check and can fail.
---
 pp_pack.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/pp_pack.c b/pp_pack.c
index 2e80d8057e0f..6243424434af 100644
--- a/pp_pack.c
+++ b/pp_pack.c
@@ -253,12 +253,14 @@ utf8_to_byte(pTHX_ const char **s, const char *end, I32 datumtype)
     if (*s >= end) {
         goto croak;
     }
-    val = utf8n_to_uvchr((U8 *) *s, end-*s, &retlen,
-                         ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
-    if (retlen == (STRLEN) -1)
+    if (! utf8_to_uv_flags((U8 *) *s, (U8 *) end, &val, &retlen,
+                           ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY))
+    {
       croak:
         croak("Malformed UTF-8 string in '%c' format in unpack",
                    (int) TYPE_NO_MODIFIERS(datumtype));
+    }
+
     if (val >= 0x100) {
         ck_warner(packWARN(WARN_UNPACK),
                   "Character in '%c' format wrapped in unpack",

From f18e772dccb58e24d29273d58be8b872ef274c45 Mon Sep 17 00:00:00 2001
From: Karl Williamson <khw@cpan.org>
Date: Mon, 11 Aug 2025 07:50:23 -0600
Subject: [PATCH 4/9] pp_pack: unpack: Use 'uv' form, not 'uvchr'

The new form croaks under failure; the previous didn't.  But to get here
the previous paragraph of code had to succeed, which means this should
too.

The flag in the changed call is the default for utf8_to_uv_or_die(), so
it can be removed.
---
 pp_pack.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/pp_pack.c b/pp_pack.c
index 6243424434af..b37dc2b2b280 100644
--- a/pp_pack.c
+++ b/pp_pack.c
@@ -1322,8 +1322,7 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, const char *s, const char *strbeg, const c
                         break;
                     }
 
-                    auv = utf8n_to_uvchr(result, len, &retlen,
-                                         UTF8_ALLOW_DEFAULT);
+                    auv = utf8_to_uv_or_die(result, result + len, &retlen);
                     s = ptr;
                 }
                 else {

From b4ff6083661d2574fb305632e2acd6300cb649e8 Mon Sep 17 00:00:00 2001
From: Karl Williamson <khw@cpan.org>
Date: Mon, 11 Aug 2025 16:31:44 -0600
Subject: [PATCH 5/9] pp_pack: unpack: Use 'uv' form, not 'uvchr'

This converts a call to utf8n_to_uvchr(), using instead utf8_to_uv().
The flag parameter UTF8_ALLOW_DEFAULT before this commit is the default
for utf8_to_uv(), so it can be omitted.

Like previous commits, the previous code thought it was checking for
failure and croaking, but was checking the wrong way so that it never
could fail.  In the new commit, I had to remove that checking, because
it caused test suite failures.
---
 pp_pack.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/pp_pack.c b/pp_pack.c
index b37dc2b2b280..6619b2ffb63a 100644
--- a/pp_pack.c
+++ b/pp_pack.c
@@ -1326,10 +1326,7 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, const char *s, const char *strbeg, const c
                     s = ptr;
                 }
                 else {
-                    auv = utf8n_to_uvchr((U8*)s, strend - s, &retlen,
-                                         UTF8_ALLOW_DEFAULT);
-                    if (retlen == (STRLEN) -1)
-                        croak("Malformed UTF-8 string in unpack");
+                    (void) utf8_to_uv((U8 *) s, (U8 *) strend, &auv, &retlen);
                     s += retlen;
                 }
                 if (!checksum)

From 9d00e3710091d0335577011b9dc557838d01c39b Mon Sep 17 00:00:00 2001
From: Karl Williamson <khw@cpan.org>
Date: Mon, 11 Aug 2025 16:31:23 -0600
Subject: [PATCH 6/9] pp_pack: unpack: Use 'uv' form, not 'uvchr'

Like previous commits, this converts a call to utf8n_to_uvchr() to use
instead utf8_to_uv_flags().

And like previous commits, the previous code thought it was checking for
failure and croaking, but was checking the wrong way so that it never
could fail.  With the new commit, it can fail.

Also, the previous commit stored the result in an 'int', whose range can
be significantly less than the code point returned.  The value was
truncated without notice.  Now, I croak if the value got truncated.

I wonder what I should do, as this is for unpacking a 'c' format, which
is supposed to be a signed 8-bit value.
---
 pp_pack.c | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/pp_pack.c b/pp_pack.c
index 6619b2ffb63a..f6dde13d8ed1 100644
--- a/pp_pack.c
+++ b/pp_pack.c
@@ -1239,10 +1239,21 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, const char *s, const char *strbeg, const c
                 int aint;
                 if (utf8) {
                     STRLEN retlen;
-                    aint = utf8n_to_uvchr((U8 *) s, strend-s, &retlen,
-                                 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
-                    if (retlen == (STRLEN) -1)
+                    UV auv;
+                    if (! utf8_to_uv_flags((U8 *) s, (U8 *) strend,
+                                           &auv, &retlen,
+                                           (ckWARN(WARN_UTF8))
+                                            ? 0
+                                            : UTF8_ALLOW_ANY))
+                    {
+                        croak("Malformed UTF-8 string in unpack");
+                    }
+
+                    aint = auv;
+                    if ( (UV) aint != auv) {
                         croak("Malformed UTF-8 string in unpack");
+                    }
+
                     s += retlen;
                 }
                 else {

From 55c9e56b97b2261c35fab9eb901c176c5dd06d7d Mon Sep 17 00:00:00 2001
From: Karl Williamson <khw@cpan.org>
Date: Mon, 11 Aug 2025 14:54:06 -0600
Subject: [PATCH 7/9] pp_pack: utf8_to_bytes: Use 'uv' form, not 'uvchr'

Like previous commits, this converts a call to utf8n_to_uvchr() to
instead use utf8_to_uv_flags().

Unlike those commits, the previous code did in fact properly check for
failure.  But upon failure, it semi-unsafely assumed the start byte was
accurate as to the intended byte length of the input character.  (It did
check that the returned value did not go past the end of the buffer, by
using UTF8_SAFE_SKIP() ).  The new function call always returns the
correct number of bytes, so just use that instead of trying to recompute
it.

The mechanism for deteriming failure no longer depends on the CHECK_ONLY
flag, so that is removed.
---
 pp_pack.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/pp_pack.c b/pp_pack.c
index f6dde13d8ed1..a70e0937a270 100644
--- a/pp_pack.c
+++ b/pp_pack.c
@@ -282,8 +282,7 @@ S_utf8_to_bytes(pTHX_ const char **s, const char *end, const char *buf, SSize_t
     STRLEN retlen;
     const char *from = *s;
     int bad = 0;
-    const U32 flags = ckWARN(WARN_UTF8) ?
-        UTF8_CHECK_ONLY : (UTF8_CHECK_ONLY | UTF8_ALLOW_ANY);
+    const U32 flags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
     const bool needs_swap = NEEDS_SWAP(datumtype);
 
     if (UNLIKELY(needs_swap))
@@ -291,17 +290,18 @@ S_utf8_to_bytes(pTHX_ const char **s, const char *end, const char *buf, SSize_t
 
     for (; buf_len > 0; buf_len--) {
         if (from >= end) return FALSE;
-        val = utf8n_to_uvchr((U8 *) from, end-from, &retlen, flags);
-        if (retlen == (STRLEN) -1) {
-            from += UTF8_SAFE_SKIP(from, end);
+        if (! utf8_to_uv_flags((U8 *) from, (U8 *) end, &val, &retlen, flags))
+        {
             bad |= 1;
-        } else from += retlen;
+        }
 
             if (val >= 0x100) {
                 bad |= 2;
                 val = (U8) val;
             }
 
+        from += retlen;
+
         if (UNLIKELY(needs_swap))
             *(U8 *)--buf = (U8)val;
         else

From 90bf29ff8c0f74de78ea55fff859fe89ab934fd6 Mon Sep 17 00:00:00 2001
From: Karl Williamson <khw@cpan.org>
Date: Mon, 11 Aug 2025 15:05:03 -0600
Subject: [PATCH 8/9] pp_pack.c: S_utf8_to_bytes: Rmv loop

Prior to this commit, the code went through the buffer with warnings
turned off, but saving the fact if there were problems.  If so, it ran
through the buffer again, with warnings turned on to display those
problems.

I'm unsure of why it didn't output any warnings until it got to the end.
But utf8_to_uv_msgs() allows us to store up the messages and output them
later, without having to reparse
---
 pp_pack.c | 51 ++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 36 insertions(+), 15 deletions(-)

diff --git a/pp_pack.c b/pp_pack.c
index a70e0937a270..ce49773b7f91 100644
--- a/pp_pack.c
+++ b/pp_pack.c
@@ -288,20 +288,32 @@ S_utf8_to_bytes(pTHX_ const char **s, const char *end, const char *buf, SSize_t
     if (UNLIKELY(needs_swap))
         buf += buf_len;
 
+    AV * msgs = NULL;
     for (; buf_len > 0; buf_len--) {
         if (from >= end) return FALSE;
-        if (! utf8_to_uv_flags((U8 *) from, (U8 *) end, &val, &retlen, flags))
-        {
-            bad |= 1;
-        }
 
+        AV * this_msgs = NULL;
+        if (utf8_to_uv_msgs((U8 *) from, (U8 *) end, &val, &retlen, flags,
+                            NULL, &this_msgs))
+        {
             if (val >= 0x100) {
                 bad |= 2;
                 val = (U8) val;
             }
+        }
 
         from += retlen;
 
+        /* Add any messages from this conversion to the list for later output
+         * */
+        if (this_msgs) {
+            while (av_count(this_msgs) > 0) {
+                av_push(msgs, av_shift(this_msgs));
+            }
+
+            Safefree(this_msgs);
+        }
+
         if (UNLIKELY(needs_swap))
             *(U8 *)--buf = (U8)val;
         else
@@ -309,25 +321,34 @@ S_utf8_to_bytes(pTHX_ const char **s, const char *end, const char *buf, SSize_t
     }
 
     /* We have enough characters for the buffer. Did we have problems ? */
-    if (bad) {
-        if (bad & 1) {
-            /* Rewalk the string fragment while warning */
-            const char *ptr;
-            const U32 flags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
-            for (ptr = *s; ptr < from; ptr += UTF8SKIP(ptr)) {
-                if (ptr >= end) break;
-                utf8n_to_uvchr((U8 *) ptr, end-ptr, &retlen, flags);
-            }
-            if (from > end) from = end;
+    if (msgs) {
+        while (av_count(msgs) > 0) {
+            HV * msg_hash = (HV *) av_shift(msgs);
+            SV ** packed_categories_p = hv_fetchs(msg_hash, "warn_categories", 0);
+            if (packed_categories_p == NULL) {
+                continue;
+            }
+
+            UV packed_categories = SvUV(*packed_categories_p);
+            if (packed_categories == 0) {
+                continue;
+            }
+
+            SV ** warn_text_p = hv_fetchs(msg_hash, "text", 0);
+            if (warn_text_p) {
+                warner(packed_categories, "%s", SvPV_nolen(*warn_text_p));
+            }
         }
 
+        Safefree(msgs);
+    }
+
     if ((bad & 2))
         ck_warner(packWARN(datumtype & TYPE_IS_PACK ?
                             WARN_PACK : WARN_UNPACK),
                   "Character(s) in '%c' format wrapped in %s",
                   (int) TYPE_NO_MODIFIERS(datumtype),
                   datumtype & TYPE_IS_PACK ? "pack" : "unpack");
-    }
 
     *s = from;
     return TRUE;

From 6556ff3788d40866966dd4774c785f6ef0afe085 Mon Sep 17 00:00:00 2001
From: Karl Williamson <khw@cpan.org>
Date: Mon, 11 Aug 2025 15:15:56 -0600
Subject: [PATCH 9/9] pp_pack.c: Convert variable to bool

The reason for it to be more than this was removed in the previous
commit
---
 pp_pack.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/pp_pack.c b/pp_pack.c
index ce49773b7f91..465d302ff982 100644
--- a/pp_pack.c
+++ b/pp_pack.c
@@ -281,7 +281,7 @@ S_utf8_to_bytes(pTHX_ const char **s, const char *end, const char *buf, SSize_t
     UV val;
     STRLEN retlen;
     const char *from = *s;
-    int bad = 0;
+    bool bad = false;
     const U32 flags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
     const bool needs_swap = NEEDS_SWAP(datumtype);
 
@@ -297,7 +297,7 @@ S_utf8_to_bytes(pTHX_ const char **s, const char *end, const char *buf, SSize_t
                             NULL, &this_msgs))
         {
             if (val >= 0x100) {
-                bad |= 2;
+                bad = true;
                 val = (U8) val;
             }
         }
@@ -343,12 +343,13 @@ S_utf8_to_bytes(pTHX_ const char **s, const char *end, const char *buf, SSize_t
         Safefree(msgs);
     }
 
-    if ((bad & 2))
+    if (bad) {
         ck_warner(packWARN(datumtype & TYPE_IS_PACK ?
                             WARN_PACK : WARN_UNPACK),
                   "Character(s) in '%c' format wrapped in %s",
                   (int) TYPE_NO_MODIFIERS(datumtype),
                   datumtype & TYPE_IS_PACK ? "pack" : "unpack");
+    }
 
     *s = from;
     return TRUE;