Skip to content

Commit bcffa8c

Browse files
author
Branislav Zahradník
committed
malformed utf8 message: handy functions handling malformed utf8
- encode the die/warn choice in the function name instead of passing it as an argument - make the `flags` argument optional - encapsulate the lookup-and-die combination in a single function
1 parent b512fd8 commit bcffa8c

File tree

9 files changed

+166
-65
lines changed

9 files changed

+166
-65
lines changed

doop.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -369,7 +369,7 @@ S_do_trans_count_invmap(pTHX_ SV * const sv, AV * const invmap)
369369
else {
370370
from = utf8_to_uvchr_buf(s, send, &s_len);
371371
if (from == 0 && *s != '\0') {
372-
_force_out_malformed_utf8_message(s, send, 0, MALFORMED_UTF8_DIE);
372+
force_out_malformed_utf8_die(s, send);
373373
}
374374
}
375375

@@ -486,7 +486,7 @@ S_do_trans_invmap(pTHX_ SV * const sv, AV * const invmap)
486486
else {
487487
from = utf8_to_uvchr_buf(s, send, &s_len);
488488
if (from == 0 && *s != '\0') {
489-
_force_out_malformed_utf8_message(s, send, 0, MALFORMED_UTF8_DIE);
489+
force_out_malformed_utf8_die(s, send);
490490
}
491491
}
492492

embed.fnc

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1115,6 +1115,9 @@ EXop |char *|dup_warnings |NULLOK char *warnings
11151115
: Used by B
11161116
EXopx |void |emulate_cop_io |NN const COP * const c \
11171117
|NN SV * const sv
1118+
: Dies if the buffer is not well-formed UTF-8. 'length' is a plain count
: that may legitimately be zero, so it carries no pointer (NN) annotation.
Cp |void |ensure_not_malformed_utf8 \
1119+
|NN const U8 * const start_pos \
1120+
|const STRLEN length
11181121
AOdp |SV * |eval_pv |NN const char *p \
11191122
|I32 croak_on_error
11201123
AOdp |SSize_t|eval_sv |NN SV *sv \
@@ -1175,11 +1178,25 @@ Adpx |void |forbid_outofblock_ops \
11751178
|NN OP *o \
11761179
|NN const char *blockname
11771180
p |void |force_locale_unlock
1181+
Cp |void |force_out_malformed_utf8_die \
1182+
|NN const U8 * const start_pos \
1183+
|NN const U8 * const end_pos
1184+
Cp |void |force_out_malformed_utf8_die_flags \
1185+
|NN const U8 * const start_pos \
1186+
|NN const U8 * const end_pos \
1187+
|const U32 flags
11781188
Cp |void |_force_out_malformed_utf8_message \
11791189
|NN const U8 * const p \
11801190
|NN const U8 * const e \
11811191
|const U32 flags \
11821192
|const bool die_here
1193+
Cp |void |force_out_malformed_utf8_warn \
1194+
|NN const U8 * const start_pos \
1195+
|NN const U8 * const end_pos
1196+
Cp |void |force_out_malformed_utf8_warn_flags \
1197+
|NN const U8 * const start_pos \
1198+
|NN const U8 * const end_pos \
1199+
|const U32 flags
11831200
Adfpv |char * |form |NN const char *pat \
11841201
|...
11851202
: Only used in perl.c

embed.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -221,6 +221,7 @@
221221
# define dump_packsubs(a) Perl_dump_packsubs(aTHX_ a)
222222
# define dump_sub(a) Perl_dump_sub(aTHX_ a)
223223
# define dump_vindent(a,b,c,d) Perl_dump_vindent(aTHX_ a,b,c,d)
224+
# define ensure_not_malformed_utf8(a,b) Perl_ensure_not_malformed_utf8(aTHX_ a,b)
224225
# define eval_pv(a,b) Perl_eval_pv(aTHX_ a,b)
225226
# define eval_sv(a,b) Perl_eval_sv(aTHX_ a,b)
226227
# define fbm_compile(a,b) Perl_fbm_compile(aTHX_ a,b)
@@ -235,6 +236,10 @@
235236
# define foldEQ_locale(a,b,c) Perl_foldEQ_locale(aTHX_ a,b,c)
236237
# define foldEQ_utf8_flags(a,b,c,d,e,f,g,h,i) Perl_foldEQ_utf8_flags(aTHX_ a,b,c,d,e,f,g,h,i)
237238
# define forbid_outofblock_ops(a,b) Perl_forbid_outofblock_ops(aTHX_ a,b)
239+
# define force_out_malformed_utf8_die(a,b) Perl_force_out_malformed_utf8_die(aTHX_ a,b)
240+
# define force_out_malformed_utf8_die_flags(a,b,c) Perl_force_out_malformed_utf8_die_flags(aTHX_ a,b,c)
241+
# define force_out_malformed_utf8_warn(a,b) Perl_force_out_malformed_utf8_warn(aTHX_ a,b)
242+
# define force_out_malformed_utf8_warn_flags(a,b,c) Perl_force_out_malformed_utf8_warn_flags(aTHX_ a,b,c)
238243
# define free_tmps() Perl_free_tmps(aTHX)
239244
# define get_av(a,b) Perl_get_av(aTHX_ a,b)
240245
# define get_cv(a,b) Perl_get_cv(aTHX_ a,b)

handy.h

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2277,15 +2277,15 @@ END_EXTERN_C
22772277

22782278
#define generic_utf8_safe_(classnum, p, e, above_latin1) \
22792279
((! _utf8_safe_assert(p, e)) \
2280-
? (_force_out_malformed_utf8_message((U8 *) (p), (U8 *) (e), 0, MALFORMED_UTF8_DIE), 0)\
2280+
? (force_out_malformed_utf8_die((U8 *) (p), (U8 *) (e)), 0) \
22812281
: (UTF8_IS_INVARIANT(*(p))) \
22822282
? generic_isCC_(*(p), classnum) \
22832283
: (UTF8_IS_DOWNGRADEABLE_START(*(p)) \
22842284
? ((LIKELY((e) - (p) > 1 && UTF8_IS_CONTINUATION(*((p)+1)))) \
22852285
? generic_isCC_(EIGHT_BIT_UTF8_TO_NATIVE(*(p), *((p)+1 )), \
22862286
classnum) \
2287-
: (_force_out_malformed_utf8_message( \
2288-
(U8 *) (p), (U8 *) (e), 0, MALFORMED_UTF8_DIE), 0)) \
2287+
: (force_out_malformed_utf8_die( \
2288+
(U8 *) (p), (U8 *) (e)), 0)) \
22892289
: above_latin1))
22902290
/* Like the above, but calls 'above_latin1(p)' to get the utf8 value.
22912291
* 'above_latin1' can be a macro */
@@ -2294,8 +2294,8 @@ END_EXTERN_C
22942294
#define generic_non_invlist_utf8_safe_(classnum, above_latin1, p, e) \
22952295
generic_utf8_safe_(classnum, p, e, \
22962296
(UNLIKELY((e) - (p) < UTF8SKIP(p)) \
2297-
? (_force_out_malformed_utf8_message( \
2298-
(U8 *) (p), (U8 *) (e), 0, MALFORMED_UTF8_DIE), 0) \
2297+
? (force_out_malformed_utf8_die( \
2298+
(U8 *) (p), (U8 *) (e)), 0) \
22992299
: above_latin1(p)))
23002300
/* Like the above, but passes classnum to _isFOO_utf8(), instead of having an
23012301
* 'above_latin1' parameter */
@@ -2384,8 +2384,8 @@ END_EXTERN_C
23842384
#define isXDIGIT_utf8_safe(p, e) \
23852385
generic_utf8_safe_no_upper_latin1_(CC_XDIGIT_, p, e, \
23862386
(UNLIKELY((e) - (p) < UTF8SKIP(p)) \
2387-
? (_force_out_malformed_utf8_message( \
2388-
(U8 *) (p), (U8 *) (e), 0, MALFORMED_UTF8_DIE), 0) \
2387+
? (force_out_malformed_utf8_die( \
2388+
(U8 *) (p), (U8 *) (e)), 0) \
23892389
: is_XDIGIT_high(p)))
23902390

23912391
#define toFOLD_utf8(p,e,s,l) toFOLD_utf8_safe(p,e,s,l)
@@ -2433,8 +2433,8 @@ END_EXTERN_C
24332433
: (UTF8_IS_DOWNGRADEABLE_START(*(p)) \
24342434
? ((LIKELY((e) - (p) > 1 && UTF8_IS_CONTINUATION(*((p)+1)))) \
24352435
? macro(EIGHT_BIT_UTF8_TO_NATIVE(*(p), *((p)+1))) \
2436-
: (_force_out_malformed_utf8_message( \
2437-
(U8 *) (p), (U8 *) (e), 0, MALFORMED_UTF8_DIE), 0)) \
2436+
: (force_out_malformed_utf8_die( \
2437+
(U8 *) (p), (U8 *) (e)), 0)) \
24382438
: above_latin1))
24392439

24402440
#define generic_LC_invlist_utf8_safe_(macro, classnum, p, e) \
@@ -2447,8 +2447,8 @@ END_EXTERN_C
24472447
#define generic_LC_non_invlist_utf8_safe_(classnum, above_latin1, p, e) \
24482448
generic_LC_utf8_safe_(classnum, p, e, \
24492449
(UNLIKELY((e) - (p) < UTF8SKIP(p)) \
2450-
? (_force_out_malformed_utf8_message( \
2451-
(U8 *) (p), (U8 *) (e), 0, MALFORMED_UTF8_DIE), 0) \
2450+
? (force_out_malformed_utf8_die( \
2451+
(U8 *) (p), (U8 *) (e)), 0) \
24522452
: above_latin1(p)))
24532453

24542454
#define isALPHANUMERIC_LC_utf8_safe(p, e) \

pp_pack.c

Lines changed: 2 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -3161,16 +3161,8 @@ PP_wrapped(pp_pack, 0, 1)
31613161
if (SvUTF8(cat)) {
31623162
STRLEN result_len;
31633163
const char * result = SvPV_nomg(cat, result_len);
3164-
const U8 * error_pos;
3165-
3166-
if (! is_utf8_string_loc((U8 *) result, result_len, &error_pos)) {
3167-
_force_out_malformed_utf8_message(error_pos,
3168-
(U8 *) result + result_len,
3169-
0, /* no flags */
3170-
MALFORMED_UTF8_DIE
3171-
);
3172-
NOT_REACHED; /* NOTREACHED */
3173-
}
3164+
3165+
ensure_not_malformed_utf8((U8 *) result, result_len);
31743166
}
31753167

31763168
SvSETMAGIC(cat);

proto.h

Lines changed: 25 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

regexec.c

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10919,9 +10919,7 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const
1091910919
const U32 utf8n_flags = UTF8_ALLOW_DEFAULT;
1092010920
c = utf8n_to_uvchr(p, p_end - p, &c_len, utf8n_flags | UTF8_CHECK_ONLY);
1092110921
if (c_len == (STRLEN)-1) {
10922-
_force_out_malformed_utf8_message(p, p_end,
10923-
utf8n_flags,
10924-
MALFORMED_UTF8_DIE);
10922+
force_out_malformed_utf8_die_flags(p, p_end, utf8n_flags);
1092510923
NOT_REACHED; /* NOTREACHED */
1092610924
}
1092710925
if ( c > 255

toke.c

Lines changed: 9 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -902,20 +902,11 @@ Perl_lex_start(pTHX_ SV *line, PerlIO *rsfp, U32 flags)
902902

903903
if (line) {
904904
Size_t len;
905-
const U8* first_bad_char_loc;
906905

907906
s = SvPV_const(line, len);
908907

909-
if ( SvUTF8(line)
910-
&& UNLIKELY(! is_utf8_string_loc((U8 *) s,
911-
SvCUR(line),
912-
&first_bad_char_loc)))
913-
{
914-
_force_out_malformed_utf8_message(first_bad_char_loc,
915-
(U8 *) s + SvCUR(line),
916-
0,
917-
MALFORMED_UTF8_DIE);
918-
NOT_REACHED; /* NOTREACHED */
908+
if (SvUTF8(line)) {
909+
ensure_not_malformed_utf8((U8 *) s, SvCUR(line));
919910
}
920911

921912
parser->linestr = flags & LEX_START_COPIED
@@ -1542,18 +1533,10 @@ Perl_lex_next_chunk(pTHX_ U32 flags)
15421533
PL_parser->bufptr = buf + bufptr_pos;
15431534

15441535
if (UTF) {
1545-
const U8* first_bad_char_loc;
1546-
if (UNLIKELY(! is_utf8_string_loc(
1547-
(U8 *) PL_parser->bufptr,
1548-
PL_parser->bufend - PL_parser->bufptr,
1549-
&first_bad_char_loc)))
1550-
{
1551-
_force_out_malformed_utf8_message(first_bad_char_loc,
1552-
(U8 *) PL_parser->bufend,
1553-
0,
1554-
MALFORMED_UTF8_DIE);
1555-
NOT_REACHED; /* NOTREACHED */
1556-
}
1536+
ensure_not_malformed_utf8(
1537+
(U8 *) PL_parser->bufptr,
1538+
PL_parser->bufend - PL_parser->bufptr
1539+
);
15571540
}
15581541

15591542
PL_parser->oldbufptr = buf + oldbufptr_pos;
@@ -1631,10 +1614,7 @@ Perl_lex_peek_unichar(pTHX_ U32 flags)
16311614
}
16321615
unichar = utf8n_to_uvchr((U8*)s, bufend-s, &retlen, UTF8_CHECK_ONLY);
16331616
if (retlen == (STRLEN)-1) {
1634-
_force_out_malformed_utf8_message((U8 *) s,
1635-
(U8 *) bufend,
1636-
0,
1637-
MALFORMED_UTF8_DIE);
1617+
force_out_malformed_utf8_die((U8 *) s, (U8 *) bufend);
16381618
NOT_REACHED; /* NOTREACHED */
16391619
}
16401620
return unichar;
@@ -9695,16 +9675,8 @@ Perl_yylex(pTHX)
96959675
char *s = PL_bufptr;
96969676

96979677
if (UNLIKELY(PL_parser->recheck_utf8_validity)) {
9698-
const U8* first_bad_char_loc;
9699-
if (UTF && UNLIKELY(! is_utf8_string_loc((U8 *) PL_bufptr,
9700-
PL_bufend - PL_bufptr,
9701-
&first_bad_char_loc)))
9702-
{
9703-
_force_out_malformed_utf8_message(first_bad_char_loc,
9704-
(U8 *) PL_bufend,
9705-
0,
9706-
MALFORMED_UTF8_DIE);
9707-
NOT_REACHED; /* NOTREACHED */
9678+
if (UTF) {
9679+
ensure_not_malformed_utf8((U8 *) PL_bufptr, PL_bufend - PL_bufptr);
97089680
}
97099681
PL_parser->recheck_utf8_validity = FALSE;
97109682
}

utf8.c

Lines changed: 94 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,98 @@ Perl__force_out_malformed_utf8_message(pTHX_
9494
}
9595
}
9696

97+
/*
 * Perl_ensure_not_malformed_utf8
 *
 * Checks the 'length' bytes starting at 'start_pos' with
 * is_utf8_string_loc().  If that check fails, the location it reported and
 * the end of the buffer (start_pos + length) are handed to
 * force_out_malformed_utf8_die().  Otherwise the function returns normally.
 * A zero 'length' returns immediately without any check.
 */
void
98+
Perl_ensure_not_malformed_utf8(
99+
pTHX_
100+
const U8 * const start_pos,
101+
const STRLEN length
102+
) {
103+
/* Nothing to validate in an empty buffer.
 * NOTE(review): this return comes before the argument assertion below, so
 * the assertion is never evaluated for empty input - confirm this ordering
 * is intentional. */
if (! length) return;
104+
105+
PERL_ARGS_ASSERT_ENSURE_NOT_MALFORMED_UTF8;
106+
107+
/* Set by is_utf8_string_loc(); only used when the check fails. */
const U8* first_malformed_char_location;
108+
const bool has_malformed_char = ! is_utf8_string_loc(
109+
start_pos,
110+
length,
111+
&first_malformed_char_location
112+
);
113+
114+
if (UNLIKELY(has_malformed_char)) {
115+
/* Report from the reported location up to the end of the buffer. */
force_out_malformed_utf8_die(
116+
first_malformed_char_location,
117+
start_pos + length
118+
);
119+
/* Control is not expected to continue past the call above. */
NOT_REACHED;
120+
}
121+
}
122+
123+
/*
 * Perl_force_out_malformed_utf8_die
 *
 * Convenience wrapper: forwards 'start_pos' and 'end_pos' to
 * _force_out_malformed_utf8_message() with a flags value of 0 and
 * MALFORMED_UTF8_DIE as the final argument.
 */
PERL_CALLCONV void
124+
Perl_force_out_malformed_utf8_die(
125+
pTHX_
126+
const U8 * const start_pos,
127+
const U8 * const end_pos
128+
) {
129+
PERL_ARGS_ASSERT_FORCE_OUT_MALFORMED_UTF8_DIE;
130+
131+
_force_out_malformed_utf8_message(
132+
start_pos,
133+
end_pos,
134+
0,
135+
MALFORMED_UTF8_DIE
136+
);
137+
}
138+
139+
/*
 * Perl_force_out_malformed_utf8_die_flags
 *
 * Convenience wrapper: forwards 'start_pos', 'end_pos' and the caller's
 * 'flags' unchanged to _force_out_malformed_utf8_message(), with
 * MALFORMED_UTF8_DIE as the final argument.
 */
PERL_CALLCONV void
140+
Perl_force_out_malformed_utf8_die_flags(
141+
pTHX_
142+
const U8 * const start_pos,
143+
const U8 * const end_pos,
144+
const U32 flags
145+
) {
146+
PERL_ARGS_ASSERT_FORCE_OUT_MALFORMED_UTF8_DIE_FLAGS;
147+
148+
_force_out_malformed_utf8_message(
149+
start_pos,
150+
end_pos,
151+
flags,
152+
MALFORMED_UTF8_DIE
153+
);
154+
}
155+
156+
/*
 * Perl_force_out_malformed_utf8_warn
 *
 * Convenience wrapper: forwards 'start_pos' and 'end_pos' to
 * _force_out_malformed_utf8_message() with a flags value of 0 and
 * MALFORMED_UTF8_WARN as the final argument.
 */
PERL_CALLCONV void
157+
Perl_force_out_malformed_utf8_warn(
158+
pTHX_
159+
const U8 * const start_pos,
160+
const U8 * const end_pos
161+
) {
162+
PERL_ARGS_ASSERT_FORCE_OUT_MALFORMED_UTF8_WARN;
163+
164+
_force_out_malformed_utf8_message(
165+
start_pos,
166+
end_pos,
167+
0,
168+
MALFORMED_UTF8_WARN
169+
);
170+
}
171+
172+
/*
 * Perl_force_out_malformed_utf8_warn_flags
 *
 * Convenience wrapper: forwards 'start_pos', 'end_pos' and the caller's
 * 'flags' unchanged to _force_out_malformed_utf8_message(), with
 * MALFORMED_UTF8_WARN as the final argument.
 */
PERL_CALLCONV void
173+
Perl_force_out_malformed_utf8_warn_flags(
174+
pTHX_
175+
const U8 * const start_pos,
176+
const U8 * const end_pos,
177+
const U32 flags
178+
) {
179+
PERL_ARGS_ASSERT_FORCE_OUT_MALFORMED_UTF8_WARN_FLAGS;
180+
181+
_force_out_malformed_utf8_message(
182+
start_pos,
183+
end_pos,
184+
flags,
185+
MALFORMED_UTF8_WARN
186+
);
187+
}
188+
97189
STATIC HV *
98190
S_new_msg_hv(pTHX_ const char * const message, /* The message text */
99191
U32 categories, /* Packed warning categories */
@@ -3308,7 +3400,7 @@ S_is_utf8_common(pTHX_ const U8 *const p, const U8 * const e,
33083400
PERL_ARGS_ASSERT_IS_UTF8_COMMON;
33093401

33103402
if (cp == 0 && (p >= e || *p != '\0')) {
3311-
_force_out_malformed_utf8_message(p, e, 0, MALFORMED_UTF8_DIE);
3403+
force_out_malformed_utf8_die(p, e);
33123404
NOT_REACHED; /* NOTREACHED */
33133405
}
33143406

@@ -3853,7 +3945,7 @@ S_turkic_uc(pTHX_ const U8 * const p, const U8 * const e,
38533945
STRLEN len_result; \
38543946
result = utf8n_to_uvchr(p, e - p, &len_result, UTF8_CHECK_ONLY); \
38553947
if (len_result == (STRLEN) -1) { \
3856-
_force_out_malformed_utf8_message(p, e, 0, MALFORMED_UTF8_DIE ); \
3948+
force_out_malformed_utf8_die(p, e); \
38573949
}
38583950

38593951
#define CASE_CHANGE_BODY_END(locale_flags, change_macro) \

0 commit comments

Comments
 (0)