Skip to content

Commit cbcdf34

Browse files
[3.13] gh-53203: Fix strptime() for %c, %x and %X formats on many locales (GH-125406) (GH-125454)
Fixed most locales that use non-ASCII digits, like Persian, Burmese, Odia and Shan. (cherry picked from commit 5f4e5b5) Co-authored-by: Serhiy Storchaka <[email protected]>
1 parent 7966c7d commit cbcdf34

File tree

4 files changed

+77
-43
lines changed

4 files changed

+77
-43
lines changed

Lib/_strptime.py

Lines changed: 43 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
import locale
1616
import calendar
1717
from re import compile as re_compile
18+
from re import sub as re_sub
1819
from re import IGNORECASE
1920
from re import escape as re_escape
2021
from datetime import (date as datetime_date,
@@ -129,11 +130,23 @@ def __calc_date_time(self):
129130
time_tuple = time.struct_time((1999,3,17,22,44,55,2,76,0))
130131
time_tuple2 = time.struct_time((1999,1,3,1,1,1,6,3,0))
131132
replacement_pairs = [
132-
('1999', '%Y'), ('99', '%y'), ('22', '%H'),
133-
('44', '%M'), ('55', '%S'), ('76', '%j'),
134-
('17', '%d'), ('03', '%m'), ('3', '%m'),
135-
# '3' needed for when no leading zero.
136-
('2', '%w'), ('10', '%I')]
133+
('1999', '%Y'), ('99', '%y'), ('22', '%H'),
134+
('44', '%M'), ('55', '%S'), ('76', '%j'),
135+
('17', '%d'), ('03', '%m'), ('3', '%m'),
136+
# '3' needed for when no leading zero.
137+
('2', '%w'), ('10', '%I'),
138+
# Non-ASCII digits
139+
('\u0661\u0669\u0669\u0669', '%Y'),
140+
('\u0669\u0669', '%Oy'),
141+
('\u0662\u0662', '%OH'),
142+
('\u0664\u0664', '%OM'),
143+
('\u0665\u0665', '%OS'),
144+
('\u0661\u0667', '%Od'),
145+
('\u0660\u0663', '%Om'),
146+
('\u0663', '%Om'),
147+
('\u0662', '%Ow'),
148+
('\u0661\u0660', '%OI'),
149+
]
137150
date_time = []
138151
for directive in ('%c', '%x', '%X'):
139152
current_format = time.strftime(directive, time_tuple).lower()
@@ -158,6 +171,10 @@ def __calc_date_time(self):
158171
for tz in tz_values:
159172
if tz:
160173
current_format = current_format.replace(tz, "%Z")
174+
# Transform all non-ASCII digits to digits in range U+0660 to U+0669.
175+
current_format = re_sub(r'\d(?<![0-9])',
176+
lambda m: chr(0x0660 + int(m[0])),
177+
current_format)
161178
for old, new in replacement_pairs:
162179
current_format = current_format.replace(old, new)
163180
# If %W is used, then Sunday, 2005-01-03 will fall on week 0 since
@@ -267,7 +284,7 @@ def __init__(self, locale_time=None):
267284
else:
268285
self.locale_time = LocaleTime()
269286
base = super()
270-
base.__init__({
287+
mapping = {
271288
# The " [1-9]" part of the regex is to make %c from ANSI C work
272289
'd': r"(?P<d>3[0-1]|[1-2]\d|0[1-9]|[1-9]| [1-9])",
273290
'f': r"(?P<f>[0-9]{1,6})",
@@ -296,11 +313,15 @@ def __init__(self, locale_time=None):
296313
'Z': self.__seqToRE((tz for tz_names in self.locale_time.timezone
297314
for tz in tz_names),
298315
'Z'),
299-
'%': '%'})
300-
base.__setitem__('W', base.__getitem__('U').replace('U', 'W'))
301-
base.__setitem__('c', self.pattern(self.locale_time.LC_date_time))
302-
base.__setitem__('x', self.pattern(self.locale_time.LC_date))
316+
'%': '%'}
317+
for d in 'dmyHIMS':
318+
mapping['O' + d] = r'(?P<%s>\d\d|\d| \d)' % d
319+
mapping['Ow'] = r'(?P<w>\d)'
320+
mapping['W'] = mapping['U'].replace('U', 'W')
321+
base.__init__(mapping)
303322
base.__setitem__('X', self.pattern(self.locale_time.LC_time))
323+
base.__setitem__('x', self.pattern(self.locale_time.LC_date))
324+
base.__setitem__('c', self.pattern(self.locale_time.LC_date_time))
304325

305326
def __seqToRE(self, to_convert, directive):
306327
"""Convert a list to a regex string for matching a directive.
@@ -328,28 +349,25 @@ def pattern(self, format):
328349
regex syntax are escaped.
329350
330351
"""
331-
processed_format = ''
332352
# The sub() call escapes all characters that might be misconstrued
333353
# as regex syntax. Cannot use re.escape since we have to deal with
334354
# format directives (%m, etc.).
335-
regex_chars = re_compile(r"([\\.^$*+?\(\){}\[\]|])")
336-
format = regex_chars.sub(r"\\\1", format)
337-
whitespace_replacement = re_compile(r'\s+')
338-
format = whitespace_replacement.sub(r'\\s+', format)
355+
format = re_sub(r"([\\.^$*+?\(\){}\[\]|])", r"\\\1", format)
356+
format = re_sub(r'\s+', r'\\s+', format)
357+
format = re_sub(r"'", "['\u02bc]", format) # needed for br_FR
339358
year_in_format = False
340359
day_of_month_in_format = False
341-
while '%' in format:
342-
directive_index = format.index('%')+1
343-
format_char = format[directive_index]
344-
processed_format = "%s%s%s" % (processed_format,
345-
format[:directive_index-1],
346-
self[format_char])
347-
format = format[directive_index+1:]
360+
def repl(m):
361+
format_char = m[1]
348362
match format_char:
349363
case 'Y' | 'y' | 'G':
364+
nonlocal year_in_format
350365
year_in_format = True
351366
case 'd':
367+
nonlocal day_of_month_in_format
352368
day_of_month_in_format = True
369+
return self[format_char]
370+
format = re_sub(r'%(O?.)', repl, format)
353371
if day_of_month_in_format and not year_in_format:
354372
import warnings
355373
warnings.warn("""\
@@ -360,7 +378,7 @@ def pattern(self, format):
360378
See https://github.com/python/cpython/issues/70647.""",
361379
DeprecationWarning,
362380
skip_file_prefixes=(os.path.dirname(__file__),))
363-
return "%s%s" % (processed_format, format)
381+
return format
364382

365383
def compile(self, format):
366384
"""Return a compiled re object for the format string."""
@@ -434,8 +452,8 @@ def _strptime(data_string, format="%a %b %d %H:%M:%S %Y"):
434452
_regex_cache[format] = format_regex
435453
found = format_regex.match(data_string)
436454
if not found:
437-
raise ValueError("time data %r does not match format %r :: /%s/" %
438-
(data_string, format, format_regex.pattern))
455+
raise ValueError("time data %r does not match format %r" %
456+
(data_string, format))
439457
if len(data_string) != found.end():
440458
raise ValueError("unconverted data remains: %s" %
441459
data_string[found.end():])

Lib/test/test_strptime.py

Lines changed: 31 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -292,7 +292,7 @@ def test_strptime_exception_context(self):
292292
# additional check for IndexError branch (issue #19545)
293293
with self.assertRaises(ValueError) as e:
294294
_strptime._strptime_time('19', '%Y %')
295-
self.assertIs(e.exception.__suppress_context__, True)
295+
self.assertIsNone(e.exception.__context__)
296296

297297
def test_unconverteddata(self):
298298
# Check ValueError is raised when there is unconverted data
@@ -485,12 +485,14 @@ def test_bad_timezone(self):
485485
# id_ID, ms_MY.
486486
# * Year is not included: ha_NG.
487487
# * Use non-Gregorian calendar: lo_LA, thai, th_TH.
488+
# On Windows: ar_IN, ar_SA, fa_IR, ps_AF.
488489
#
489490
# BUG: Generates regexp that does not match the current date and time
490-
# for az_IR, fa_IR, lzh_TW, my_MM, or_IN, shn_MM.
491+
# for lzh_TW.
491492
@run_with_locales('LC_TIME', 'C', 'en_US', 'fr_FR', 'de_DE', 'ja_JP',
492493
'he_IL', 'eu_ES', 'ar_AE', 'mfe_MU', 'yo_NG',
493-
'csb_PL', 'br_FR', 'gez_ET', 'brx_IN')
494+
'csb_PL', 'br_FR', 'gez_ET', 'brx_IN',
495+
'my_MM', 'or_IN', 'shn_MM', 'az_IR')
494496
def test_date_time_locale(self):
495497
# Test %c directive
496498
loc = locale.getlocale(locale.LC_TIME)[0]
@@ -512,20 +514,23 @@ def test_date_time_locale(self):
512514
self.roundtrip('%c', slice(0, 6), time.localtime(now - 366*24*3600))
513515

514516
# NB: Dates before 1969 do not roundtrip on some locales:
515-
# bo_CN, bo_IN, dz_BT, eu_ES, eu_FR.
517+
# az_IR, bo_CN, bo_IN, dz_BT, eu_ES, eu_FR, fa_IR, or_IN.
516518
@run_with_locales('LC_TIME', 'C', 'en_US', 'fr_FR', 'de_DE', 'ja_JP',
517519
'he_IL', 'ar_AE', 'mfe_MU', 'yo_NG',
518-
'csb_PL', 'br_FR', 'gez_ET', 'brx_IN')
520+
'csb_PL', 'br_FR', 'gez_ET', 'brx_IN',
521+
'my_MM', 'shn_MM')
519522
def test_date_time_locale2(self):
520523
# Test %c directive
521524
self.roundtrip('%c', slice(0, 6), (1900, 1, 1, 0, 0, 0, 0, 1, 0))
525+
self.roundtrip('%c', slice(0, 6), (1800, 1, 1, 0, 0, 0, 0, 1, 0))
522526

523527
# NB: Does not roundtrip because these locales use a non-Gregorian calendar:
524-
# lo_LA, thai, th_TH.
528+
# lo_LA, thai, th_TH. On Windows: ar_IN, ar_SA, fa_IR, ps_AF.
525529
# BUG: Generates regexp that does not match the current date
526-
# for az_IR, fa_IR, lzh_TW, my_MM, or_IN, shn_MM.
530+
# for lzh_TW.
527531
@run_with_locales('LC_TIME', 'C', 'en_US', 'fr_FR', 'de_DE', 'ja_JP',
528-
'he_IL', 'eu_ES', 'ar_AE')
532+
'he_IL', 'eu_ES', 'ar_AE',
533+
'az_IR', 'my_MM', 'or_IN', 'shn_MM')
529534
def test_date_locale(self):
530535
# Test %x directive
531536
now = time.time()
@@ -545,30 +550,39 @@ def test_date_locale(self):
545550
"musl libc issue on Emscripten, bpo-46390"
546551
)
547552
@run_with_locales('LC_TIME', 'en_US', 'fr_FR', 'de_DE', 'ja_JP',
548-
'eu_ES', 'ar_AE')
553+
'eu_ES', 'ar_AE', 'my_MM', 'shn_MM')
549554
def test_date_locale2(self):
550555
# Test %x directive
551556
self.roundtrip('%x', slice(0, 3), (1900, 1, 1, 0, 0, 0, 0, 1, 0))
557+
self.roundtrip('%x', slice(0, 3), (1800, 1, 1, 0, 0, 0, 0, 1, 0))
552558

553559
# NB: Does not roundtrip in some locales due to the ambiguity of
554560
# the time representation (bugs in locales?):
555561
# * Seconds are not included: bokmal, ff_SN, nb_NO, nn_NO, no_NO,
556562
# norwegian, nynorsk.
557563
# * Hours are in 12-hour notation without AM/PM indication: hy_AM,
558564
# ms_MY, sm_WS.
559-
# BUG: Generates regexp that does not match the current time for
560-
# aa_DJ, aa_ER, aa_ET, am_ET, az_IR, byn_ER, fa_IR, gez_ER, gez_ET,
561-
# lzh_TW, my_MM, om_ET, om_KE, or_IN, shn_MM, sid_ET, so_DJ, so_ET,
562-
# so_SO, ti_ER, ti_ET, tig_ER, wal_ET.
563-
@run_with_locales('LC_TIME', 'C', 'en_US', 'fr_FR', 'de_DE', 'ja_JP')
565+
# BUG: Generates regexp that does not match the current time for lzh_TW.
566+
@run_with_locales('LC_TIME', 'C', 'en_US', 'fr_FR', 'de_DE', 'ja_JP',
567+
'aa_ET', 'am_ET', 'az_IR', 'byn_ER', 'fa_IR', 'gez_ET',
568+
'my_MM', 'om_ET', 'or_IN', 'shn_MM', 'sid_ET', 'so_SO',
569+
'ti_ET', 'tig_ER', 'wal_ET')
564570
def test_time_locale(self):
565571
# Test %X directive
572+
loc = locale.getlocale(locale.LC_TIME)[0]
573+
pos = slice(3, 6)
574+
if glibc_ver and glibc_ver < (2, 29) and loc in {
575+
'aa_ET', 'am_ET', 'byn_ER', 'gez_ET', 'om_ET',
576+
'sid_ET', 'so_SO', 'ti_ET', 'tig_ER', 'wal_ET'}:
577+
# Hours are in 12-hour notation without AM/PM indication.
578+
# Ignore hours.
579+
pos = slice(4, 6)
566580
now = time.time()
567-
self.roundtrip('%X', slice(3, 6), time.localtime(now))
581+
self.roundtrip('%X', pos, time.localtime(now))
568582
# 1 hour 20 minutes 30 seconds ago
569-
self.roundtrip('%X', slice(3, 6), time.localtime(now - 4830))
583+
self.roundtrip('%X', pos, time.localtime(now - 4830))
570584
# 12 hours ago
571-
self.roundtrip('%X', slice(3, 6), time.localtime(now - 12*3600))
585+
self.roundtrip('%X', pos, time.localtime(now - 12*3600))
572586

573587
def test_percent(self):
574588
# Make sure % signs are handled properly

Lib/test/test_time.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -298,7 +298,7 @@ def test_strptime_exception_context(self):
298298
# additional check for IndexError branch (issue #19545)
299299
with self.assertRaises(ValueError) as e:
300300
time.strptime('19', '%Y %')
301-
self.assertIs(e.exception.__suppress_context__, True)
301+
self.assertIsNone(e.exception.__context__)
302302

303303
def test_strptime_leap_year(self):
304304
# GH-70647: warns if parsing a format with a day and no year.
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Fix :func:`time.strptime` for ``%c``, ``%x`` and ``%X`` formats in many
2+
locales that use non-ASCII digits, like Persian, Burmese, Odia and Shan.

0 commit comments

Comments
 (0)