gh-114667: Support hexadecimal floating point literals

skirpichev · skirpichev · commit 220be10d733a · 2024-01-28T10:26:56.000+03:00
This adds hexadecimal floating point literals (IEEE 754-2008 §5.12.3) and
supports construction of floats from hexadecimal strings.  Note that the
syntax is more permissive: it accepts everything that ``float.fromhex()``
currently accepts (but with a mandatory base specifier), and it also allows
grouping digits with underscores.

Examples:
```pycon
>>> 0x1.1p-1
0.53125
>>> float('0x1.1')
1.0625
>>> 0x1.1
1.0625
>>> 0x1.1_1_1
1.066650390625
1.066650390625
```

Minor changes:
* Py_ISDIGIT/ISXDIGIT macros were transformed to functions
diff --git a/Doc/library/functions.rst b/Doc/library/functions.rst
@@ -656,7 +656,8 @@ are always available.  They are listed here in alphabetical order.
 
    Return a floating point number constructed from a number or string *x*.
 
-   If the argument is a string, it should contain a decimal number, optionally
+   If the argument is a string, it should contain a decimal number
+   or a hexadecimal number, optionally
    preceded by a sign, and optionally embedded in whitespace.  The optional
    sign may be ``'+'`` or ``'-'``; a ``'+'`` sign has no effect on the value
    produced.  The argument may also be a string representing a NaN
@@ -672,11 +673,15 @@ are always available.  They are listed here in alphabetical order.
       digitpart: `digit` (["_"] `digit`)*
       number: [`digitpart`] "." `digitpart` | `digitpart` ["."]
       exponent: ("e" | "E") ["+" | "-"] `digitpart`
-      floatnumber: number [`exponent`]
+      hexfloatnumber: `~python-grammar:hexinteger` | `~python-grammar:hexfraction` | `~python-grammar:hexfloat`
+      floatnumber: (`number` [`exponent`]) | `hexfloatnumber`
       floatvalue: [`sign`] (`floatnumber` | `infinity` | `nan`)
 
    Case is not significant, so, for example, "inf", "Inf", "INFINITY", and
-   "iNfINity" are all acceptable spellings for positive infinity.
+   "iNfINity" are all acceptable spellings for positive infinity.  Note also
+   that the exponent of a hexadecimal floating point number is written in
+   decimal, and that it gives the power of 2 by which to multiply the
+   coefficient.
 
    Otherwise, if the argument is an integer or a floating point number, a
    floating point number with the same value (within Python's floating point
@@ -713,6 +718,9 @@ are always available.  They are listed here in alphabetical order.
    .. versionchanged:: 3.8
       Falls back to :meth:`~object.__index__` if :meth:`~object.__float__` is not defined.
 
+   .. versionchanged:: 3.13
+      Added support for hexadecimal floating-point numbers.
+
 
 .. index::
    single: __format__
diff --git a/Doc/reference/lexical_analysis.rst b/Doc/reference/lexical_analysis.rst
@@ -951,25 +951,36 @@ Floating point literals
 Floating point literals are described by the following lexical definitions:
 
 .. productionlist:: python-grammar
-   floatnumber: `pointfloat` | `exponentfloat`
+   floatnumber: `pointfloat` | `exponentfloat` | `hexfloat`
    pointfloat: [`digitpart`] `fraction` | `digitpart` "."
    exponentfloat: (`digitpart` | `pointfloat`) `exponent`
+   hexfloat: ("0x" | "0X") ["_"] (`hexdigitpart` | `hexpointfloat`) [`binexponent`]
    digitpart: `digit` (["_"] `digit`)*
    fraction: "." `digitpart`
    exponent: ("e" | "E") ["+" | "-"] `digitpart`
+   hexpointfloat: [`hexdigitpart`] `hexfraction` | `hexdigitpart` "."
+   hexfraction: "." `hexdigitpart`
+   hexdigitpart: `hexdigit` (["_"] `hexdigit`)*
+   binexponent: ("p" | "P") ["+" | "-"] `digitpart`
 
-Note that the integer and exponent parts are always interpreted using radix 10.
+Note that the exponent part is always interpreted using radix 10, as is the integer part of a decimal literal.
 For example, ``077e010`` is legal, and denotes the same number as ``77e10``. The
 allowed range of floating point literals is implementation-dependent.  As in
 integer literals, underscores are supported for digit grouping.
 
+The exponent of a hexadecimal floating point literal is written in decimal, and
+it gives the power of 2 by which to multiply the coefficient.
+
 Some examples of floating point literals::
 
    3.14    10.    .001    1e100    3.14e-10    0e0    3.14_15_93
 
 .. versionchanged:: 3.6
    Underscores are now allowed for grouping purposes in literals.
 
+.. versionchanged:: 3.13
+   Added support for hexadecimal floating-point literals.
+
 
 .. index::
    single: j; in numeric literal
diff --git a/Doc/tutorial/floatingpoint.rst b/Doc/tutorial/floatingpoint.rst
@@ -210,7 +210,7 @@ the float value exactly:
 
 .. doctest::
 
-    >>> x == float.fromhex('0x1.921f9f01b866ep+1')
+    >>> x == 0x1.921f9f01b866ep+1
     True
 
 Since the representation is exact, it is useful for reliably porting values
diff --git a/Include/cpython/pyctype.h b/Include/cpython/pyctype.h
@@ -21,11 +21,17 @@ PyAPI_DATA(const unsigned int) _Py_ctype_table[256];
 #define Py_ISLOWER(c)  (_Py_ctype_table[Py_CHARMASK(c)] & PY_CTF_LOWER)
 #define Py_ISUPPER(c)  (_Py_ctype_table[Py_CHARMASK(c)] & PY_CTF_UPPER)
 #define Py_ISALPHA(c)  (_Py_ctype_table[Py_CHARMASK(c)] & PY_CTF_ALPHA)
-#define Py_ISDIGIT(c)  (_Py_ctype_table[Py_CHARMASK(c)] & PY_CTF_DIGIT)
-#define Py_ISXDIGIT(c) (_Py_ctype_table[Py_CHARMASK(c)] & PY_CTF_XDIGIT)
 #define Py_ISALNUM(c)  (_Py_ctype_table[Py_CHARMASK(c)] & PY_CTF_ALNUM)
 #define Py_ISSPACE(c)  (_Py_ctype_table[Py_CHARMASK(c)] & PY_CTF_SPACE)
 
+static inline int Py_ISDIGIT(char c) {
+    return _Py_ctype_table[Py_CHARMASK(c)] & PY_CTF_DIGIT;
+}
+
+static inline int Py_ISXDIGIT(char c) {
+    return _Py_ctype_table[Py_CHARMASK(c)] & PY_CTF_XDIGIT;
+}
+
 PyAPI_DATA(const unsigned char) _Py_ctype_tolower[256];
 PyAPI_DATA(const unsigned char) _Py_ctype_toupper[256];
 
diff --git a/Include/internal/pycore_floatobject.h b/Include/internal/pycore_floatobject.h
@@ -56,6 +56,7 @@ extern PyObject* _Py_string_to_number_with_underscores(
 
 extern double _Py_parse_inf_or_nan(const char *p, char **endptr);
 
+extern double _Py_dg_strtod_hex(const char *str, char **ptr);
 
 #ifdef __cplusplus
 }
diff --git a/Lib/test/test_float.py b/Lib/test/test_float.py
@@ -38,9 +38,9 @@ def test_float(self):
         self.assertEqual(float(3.14), 3.14)
         self.assertEqual(float(314), 314.0)
         self.assertEqual(float("  3.14  "), 3.14)
-        self.assertRaises(ValueError, float, "  0x3.1  ")
-        self.assertRaises(ValueError, float, "  -0x3.p-1  ")
-        self.assertRaises(ValueError, float, "  +0x3.p-1  ")
+        self.assertEqual(float("  0x3.1  "), 3.0625)
+        self.assertEqual(float("  -0x3.p-1  "), -1.5)
+        self.assertEqual(float("  +0x3.p-1  "), 1.5)
         self.assertRaises(ValueError, float, "++3.14")
         self.assertRaises(ValueError, float, "+-3.14")
         self.assertRaises(ValueError, float, "-+3.14")
@@ -70,13 +70,13 @@ def test_noargs(self):
 
     def test_underscores(self):
         for lit in VALID_UNDERSCORE_LITERALS:
-            if not any(ch in lit for ch in 'jJxXoObB'):
+            if not any(ch in lit for ch in 'jJoObB'):
                 self.assertEqual(float(lit), eval(lit))
                 self.assertEqual(float(lit), float(lit.replace('_', '')))
         for lit in INVALID_UNDERSCORE_LITERALS:
             if lit in ('0_7', '09_99'):  # octals are not recognized here
                 continue
-            if not any(ch in lit for ch in 'jJxXoObB'):
+            if not any(ch in lit for ch in 'jJoObB'):
                 self.assertRaises(ValueError, float, lit)
         # Additional test cases; nan and inf are never valid as literals,
         # only in the float() constructor, but we don't allow underscores
@@ -173,9 +173,9 @@ def test_float_with_comma(self):
         self.assertRaises(ValueError, float, "  3,14  ")
         self.assertRaises(ValueError, float, "  +3,14  ")
         self.assertRaises(ValueError, float, "  -3,14  ")
-        self.assertRaises(ValueError, float, "  0x3.1  ")
-        self.assertRaises(ValueError, float, "  -0x3.p-1  ")
-        self.assertRaises(ValueError, float, "  +0x3.p-1  ")
+        self.assertEqual(float("  0x3.1  "), 3.0625)
+        self.assertEqual(float("  -0x3.p-1  "), -1.5)
+        self.assertEqual(float("  +0x3.p-1  "), 1.5)
         self.assertEqual(float("  25.e-1  "), 2.5)
         self.assertAlmostEqual(float("  .25e-1  "), .025)
 
@@ -1483,7 +1483,7 @@ def roundtrip(x):
             except OverflowError:
                 pass
             else:
-                self.identical(x, fromHex(toHex(x)))
+                self.identical(x, roundtrip(x))
 
     def test_subclass(self):
         class F(float):
diff --git a/Lib/test/test_grammar.py b/Lib/test/test_grammar.py
@@ -19,8 +19,11 @@
 
 # These are shared with test_tokenize and other test modules.
 #
-# Note: since several test cases filter out floats by looking for "e" and ".",
-# don't add hexadecimal literals that contain "e" or "E".
+# Note:
+# 1) several test cases filter out floats by looking for "e" and ".":
+#    don't add hexadecimal literals that contain "e" or "E".
+# 2) several tests also filter out binary integers by looking for "b" or "B":
+#    don't add hexadecimal floating point literals that contain those digits.
 VALID_UNDERSCORE_LITERALS = [
     '0_0_0',
     '4_2',
@@ -43,6 +46,16 @@
     '.1_4j',
     '(1_2.5+3_3j)',
     '(.5_6j)',
+    '0x_.1p1',
+    '0X_.1p1',
+    '0x1_1.p1',
+    '0x_1_1.p1',
+    '0x1.1_1p1',
+    '0x1.p1_1',
+    '0xa.p1',
+    '0x.ap1',
+    '0xa_c.p1',
+    '0x.a_cp1',
 ]
 INVALID_UNDERSCORE_LITERALS = [
     # Trailing underscores:
@@ -54,6 +67,8 @@
     '0xf_',
     '0o5_',
     '0 if 1_Else 1',
+    '0x1p1_',
+    '0x1.1p1_',
     # Underscores in the base selector:
     '0_b0',
     '0_xf',
@@ -71,28 +86,41 @@
     '0o5__77',
     '1e1__0',
     '1e1__0j',
+    '0x1__1.1p1',
     # Underscore right before a dot:
     '1_.4',
     '1_.4j',
+    '0x1_.p1',
+    '0xa_.p1',
     # Underscore right after a dot:
     '1._4',
     '1._4j',
     '._5',
     '._5j',
+    '0x1._p1',
+    '0xa._p1',
     # Underscore right after a sign:
     '1.0e+_1',
     '1.0e+_1j',
+    '0x1.1p+_1',
     # Underscore right before j:
     '1.4_j',
     '1.4e5_j',
-    # Underscore right before e:
+    '0x1.1p1_j',
+    # Underscore right before e or p:
     '1_e1',
     '1.4_e1',
     '1.4_e1j',
-    # Underscore right after e:
+    '0x1_p1',
+    '0x1_P1',
+    '0x1.1_p1',
+    '0x1.1_P1',
+    # Underscore right after e or p:
     '1e_1',
     '1.4e_1',
     '1.4e_1j',
+    '0x1p_1',
+    '0x1.1p_1',
     # Complex cases with parens:
     '(1+1.5_j_)',
     '(1+1.5_j)',
@@ -173,6 +201,23 @@ def test_floats(self):
         x = 3.e14
         x = .3e14
         x = 3.1e4
+        x = 0x1.2p1
+        x = 0x1.2p+1
+        x = 0x1.p1
+        x = 0x1.p-1
+        x = 0x1p0
+        x = 0x1ap1
+        x = 0x1P1
+        x = 0x1cp2
+        x = 0x1.p1
+        x = 0x1.P1
+        x = 0x001.1p2
+        x = 0X1p1
+        x = 0x1.1_1p1
+        x = 0x1.1p1_1
+        x = 0x1.
+        x = 0x1.1
+        x = 0x.1
 
     def test_float_exponent_tokenization(self):
         # See issue 21642.
@@ -210,7 +255,14 @@ def test_bad_numerical_literals(self):
               "use an 0o prefix for octal integers")
         check("1.2_", "invalid decimal literal")
         check("1e2_", "invalid decimal literal")
-        check("1e+", "invalid decimal literal")
+        check("1e+", "invalid float literal")
+        check("0x.p", "invalid float literal")
+        check("0x_.p", "invalid float literal")
+        check("0x1.1p", "invalid float literal")
+        check("0x1.1_p", "invalid float literal")
+        check("0x1.1p_", "invalid float literal")
+        check("0xp", "invalid hexadecimal literal")
+        check("0xP", "invalid hexadecimal literal")
 
     def test_end_of_numerical_literals(self):
         def check(test, error=False):
diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
@@ -265,6 +265,16 @@ def test_float(self):
     NAME       'x'           (1, 0) (1, 1)
     OP         '='           (1, 2) (1, 3)
     NUMBER     '3.14e159'    (1, 4) (1, 12)
+    """)
+        self.check_tokenize("x = 0x1p1", """\
+    NAME       'x'           (1, 0) (1, 1)
+    OP         '='           (1, 2) (1, 3)
+    NUMBER     '0x1p1'       (1, 4) (1, 9)
+    """)
+        self.check_tokenize("x = 0x.1p1", """\
+    NAME       'x'           (1, 0) (1, 1)
+    OP         '='           (1, 2) (1, 3)
+    NUMBER     '0x.1p1'      (1, 4) (1, 10)
     """)
 
     def test_underscore_literals(self):
diff --git a/Lib/tokenize.py b/Lib/tokenize.py
@@ -77,7 +77,10 @@ def maybe(*choices): return group(*choices) + '?'
 Pointfloat = group(r'[0-9](?:_?[0-9])*\.(?:[0-9](?:_?[0-9])*)?',
                    r'\.[0-9](?:_?[0-9])*') + maybe(Exponent)
 Expfloat = r'[0-9](?:_?[0-9])*' + Exponent
-Floatnumber = group(Pointfloat, Expfloat)
+HexExponent = r'[pP][-+]?[0-9](?:_?[0-9])*'
+Hexfloat = group(r'0[xX]_?[0-9a-f](?:_?[0-9a-f])*\.(?:[0-9a-f](?:_?[0-9a-f])*)?',
+                 r'0[xX]_?\.[0-9a-f](?:_?[0-9a-f])*') + HexExponent
+Floatnumber = group(Pointfloat, Expfloat, Hexfloat)
 Imagnumber = group(r'[0-9](?:_?[0-9])*[jJ]', Floatnumber + r'[jJ]')
 Number = group(Imagnumber, Floatnumber, Intnumber)
 
diff --git a/Misc/NEWS.d/next/Core and Builtins/2024-01-28-08-17-08.gh-issue-114667.8w_l9I.rst b/Misc/NEWS.d/next/Core and Builtins/2024-01-28-08-17-08.gh-issue-114667.8w_l9I.rst
@@ -0,0 +1,3 @@
+Add hexadecimal floating point literals (IEEE 754-2008 §5.12.3) and support
+construction of floats from hexadecimal strings.  Patch by Sergey B
+Kirpichev.
diff --git a/Objects/floatobject.c b/Objects/floatobject.c
diff --git a/Parser/lexer/lexer.c b/Parser/lexer/lexer.c
diff --git a/Python/dtoa.c b/Python/dtoa.c
diff --git a/Python/pystrtod.c b/Python/pystrtod.c

Original file line number	Diff line number	Diff line change
`@@ -56,6 +56,7 @@ extern PyObject* _Py_string_to_number_with_underscores(`
`56`	`56`
`57`	`57`	`extern double _Py_parse_inf_or_nan(const char *p, char **endptr);`
`58`	`58`
	`59`	`+extern double _Py_dg_strtod_hex(const char *str, char **ptr);`
`59`	`60`
`60`	`61`	`#ifdef __cplusplus`
`61`	`62`	`}`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+Add hexadecimal floating point literals (IEEE 754-2008 §5.12.3) and support`
	`2`	`+construction of floats from hexadecimal strings. Patch by Sergey B`
	`3`	`+Kirpichev.`