Skip to content

Commit f5a2c5e

Browse files
committed
bpo-15216: TextIOWrapper support set encoding, errors and newline after creation
1 parent b8a7daf commit f5a2c5e

File tree

4 files changed

+561
-96
lines changed

4 files changed

+561
-96
lines changed

Doc/library/io.rst

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -901,13 +901,26 @@ Text I/O
901901
locale encoding using :func:`locale.setlocale`, use the current locale
902902
encoding instead of the user preferred encoding.
903903

904-
:class:`TextIOWrapper` provides one attribute in addition to those of
904+
:class:`TextIOWrapper` provides these members in addition to those of
905905
:class:`TextIOBase` and its parents:
906906

907907
.. attribute:: line_buffering
908908

909909
Whether line buffering is enabled.
910910

911+
.. method:: set_encoding(encoding=None, errors=None[, newline])
912+
913+
Change the encoding, error handler, and newline handler.
914+
If *encoding* is ``None`` or *newline* is unspecified, the existing
915+
setting is retained. If *errors* is ``None``, the default depends on
916+
*encoding*: if *encoding* is also ``None``, the existing error handler
917+
is retained, otherwise it is reset to ``'strict'``.
918+
919+
It is not possible to change the encoding if some data has already
920+
been read from the stream.
921+
922+
.. versionadded:: 3.7
923+
911924

912925
.. class:: StringIO(initial_value='', newline='\\n')
913926

Lib/_pyio.py

Lines changed: 60 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1946,11 +1946,7 @@ def __init__(self, buffer, encoding=None, errors=None, newline=None,
19461946
self._line_buffering = line_buffering
19471947
self._encoding = encoding
19481948
self._errors = errors
1949-
self._readuniversal = not newline
1950-
self._readtranslate = newline is None
1951-
self._readnl = newline
1952-
self._writetranslate = newline != ''
1953-
self._writenl = newline or os.linesep
1949+
self._set_newline(newline)
19541950
self._encoder = None
19551951
self._decoder = None
19561952
self._decoded_chars = '' # buffer for text returned from decoder
@@ -1995,6 +1991,65 @@ def __repr__(self):
19951991
result += " mode={0!r}".format(mode)
19961992
return result + " encoding={0!r}>".format(self.encoding)
19971993

1994+
def set_encoding(self, encoding=None, errors=None, newline=Ellipsis):
    """Change the encoding, error handler and newline mode of the stream.

    encoding -- name of the new codec; None keeps the current encoding.
    errors -- name of the new error handler.  When None, the current
        handler is kept if *encoding* is also None, otherwise it is
        reset to 'strict'.
    newline -- new newline mode (None, '', '\\n', '\\r' or '\\r\\n');
        leaving it as Ellipsis keeps the current mode.

    Nothing happens (not even a flush) when the requested settings
    resolve to the ones already in effect.

    Raises ValueError if *encoding* is not a str or *newline* is not one
    of the supported values, TypeError if *newline* is neither None nor
    a str, LookupError if the codec is unknown, and UnsupportedOperation
    if the settings would change after a decoder has been created, i.e.
    after some data has already been read from the stream.
    """
    current_codec = codecs.lookup(self._encoding).name
    if encoding is None:
        # Keep the spelling stored on the wrapper: writing back the
        # normalized codec name would silently change ``self.encoding``
        # (e.g. 'latin1' -> 'iso8859-1') when only *errors* or *newline*
        # is being changed.
        encoding = self._encoding
        requested_codec = current_codec
        if errors is None:
            errors = self._errors
    else:
        if not isinstance(encoding, str):
            raise ValueError("invalid encoding: %r" % encoding)

        if errors is None:
            errors = 'strict'

        # Normalize so that aliases of the same codec compare equal.
        encoding = requested_codec = codecs.lookup(encoding).name

    if newline is Ellipsis:
        newline = self._readnl
    else:
        # Reject values the newline machinery cannot interpret instead
        # of installing them silently.
        if newline is not None and not isinstance(newline, str):
            raise TypeError("illegal newline type: %r" % (type(newline),))
        if newline not in (None, "", "\n", "\r", "\r\n"):
            raise ValueError("illegal newline value: %r" % (newline,))

    if (requested_codec == current_codec and errors == self._errors
            and newline == self._readnl):
        # no change
        return

    if self._decoder is not None:
        raise UnsupportedOperation(
            "It is not possible to set the encoding of stream after "
            "the first read")

    # flush write buffer, so pending text is encoded with the old codec
    self.flush()

    # reset attributes; encoder and decoder are dropped here
    self._encoding = encoding
    self._errors = errors
    self._encoder = None
    self._decoder = None
    self._b2cratio = 0.0
    self._set_newline(newline)

    # don't write a BOM in the middle of a file
    if self._seekable and self.writable():
        if self.buffer.tell() != 0:
            try:
                self._get_encoder().setstate(0)
            except LookupError:
                # Sometimes the encoder doesn't exist
                pass
2045+
2046+
def _set_newline(self, newline):
    """Derive the read/write newline flags from a single *newline* mode.

    The mode itself is stored as ``_readnl``.  Reading is "universal"
    for None and '', and translated only for None.  Writing is
    translated unless the mode is '', and uses the mode itself as line
    terminator, falling back to ``os.linesep`` for None and ''.
    """
    self._readnl = newline
    self._readtranslate = newline is None
    self._readuniversal = not newline
    self._writenl = newline if newline else os.linesep
    self._writetranslate = not (newline == '')
2052+
19982053
@property
def encoding(self):
    """Return the encoding name stored on this wrapper (``_encoding``)."""
    stored_name = self._encoding
    return stored_name

Lib/test/test_io.py

Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3242,6 +3242,134 @@ def seekable(self): return True
32423242
F.tell = lambda x: 0
32433243
t = self.TextIOWrapper(F(), encoding='utf-8')
32443244

3245+
def test_set_encoding_same_codec(self):
    """An alias of the current codec is a no-op; another codec is not."""
    payload = 'foobar\n'.encode('latin1')
    wrapper = self.TextIOWrapper(self.BytesIO(payload), encoding='latin1')
    self.assertEqual(wrapper.encoding, 'latin1')

    # 'ISO-8859-1' is just another name for the codec already in use,
    # so the reported encoding must keep its original spelling.
    wrapper.set_encoding('ISO-8859-1')
    self.assertEqual(wrapper.encoding, 'latin1')

    # 'iso8859-15' is a genuinely different codec.
    wrapper.set_encoding('iso8859-15')
    self.assertEqual(wrapper.encoding, 'iso8859-15')
3258+
3259+
def test_set_encoding_read(self):
    """Switching latin1 -> utf-8 after a read must be refused."""
    # The second line is utf-8 encoded; latin1 is able to decode it too.
    latin1_line = 'abc\xe9\n'.encode('latin1')
    utf8_line = 'd\xe9f\n'.encode('utf8')
    stream = self.TextIOWrapper(self.BytesIO(latin1_line + utf8_line),
                                encoding='latin1', newline='\n')
    self.assertEqual(stream.readline(), 'abc\xe9\n')
    with self.assertRaises(self.UnsupportedOperation):
        stream.set_encoding('utf-8')
3268+
3269+
def test_set_encoding_write_fromascii(self):
    """Switching ascii -> utf-8-sig must drop the cached encode function.

    The C implementation has a dedicated encodefunc for ascii but none
    for utf-8-sig, so the cached one has to go when the encoder changes.
    """
    sink = self.BytesIO()
    writer = self.TextIOWrapper(sink, encoding='ascii', newline='\n')
    writer.write('foo\n')
    writer.set_encoding('utf-8-sig')
    writer.write('\xe9\n')
    writer.flush()
    written = sink.getvalue()
    self.assertEqual(written, b'foo\n\xc3\xa9\n')
3280+
3281+
def test_set_encoding_write(self):
    """Text written before and after a codec switch uses the right codec."""
    # latin1 -> utf-8: the switch flushes what was written so far.
    sink = self.BytesIO()
    writer = self.TextIOWrapper(sink, encoding='latin1', newline='\n')
    writer.write('abc\xe9\n')
    writer.set_encoding('utf-8')
    self.assertEqual(sink.getvalue(), b'abc\xe9\n')
    writer.write('d\xe9f\n')
    writer.flush()
    self.assertEqual(sink.getvalue(), b'abc\xe9\nd\xc3\xa9f\n')

    # ascii -> utf-8-sig: no BOM may appear in the middle of the file.
    sink = self.BytesIO()
    writer = self.TextIOWrapper(sink, encoding='ascii', newline='\n')
    writer.write('abc\n')
    writer.set_encoding('utf-8-sig')
    writer.write('d\xe9f\n')
    writer.flush()
    self.assertEqual(sink.getvalue(), b'abc\nd\xc3\xa9f\n')
3301+
3302+
def test_set_encoding_write_non_seekable(self):
    """On a non-seekable stream the BOM is written after the switch."""
    sink = self.BytesIO()
    # Make the raw stream report itself as non-seekable.
    sink.seekable = lambda: False
    sink.seek = None
    writer = self.TextIOWrapper(sink, encoding='ascii', newline='\n')
    writer.write('abc\n')
    writer.set_encoding('utf-8-sig')
    writer.write('d\xe9f\n')
    writer.flush()

    # The raw stream is not seekable, so a BOM shows up mid-stream.
    with_bom = b'abc\n\xef\xbb\xbfd\xc3\xa9f\n'
    self.assertEqual(sink.getvalue(), with_bom)
3314+
3315+
def test_set_encoding_defaults(self):
    """Omitted arguments keep the current settings, except *errors*,
    which falls back to 'strict' when a new encoding is given."""
    wrapper = self.TextIOWrapper(self.BytesIO(), 'ascii', 'replace', '\n')

    # Explicit None for both encoding and errors keeps them as they are.
    wrapper.set_encoding(None, None)
    self.assertEqual(wrapper.encoding, 'ascii')
    self.assertEqual(wrapper.errors, 'replace')
    wrapper.write('LF\n')

    # Changing only the newline leaves encoding and errors untouched.
    wrapper.set_encoding(newline='\r\n')
    self.assertEqual(wrapper.encoding, 'ascii')
    self.assertEqual(wrapper.errors, 'replace')

    # Changing only the error handler leaves the encoding untouched.
    wrapper.set_encoding(errors='ignore')
    self.assertEqual(wrapper.encoding, 'ascii')
    wrapper.write('CRLF\n')

    # A new encoding without *errors* resets the handler to 'strict'.
    wrapper.set_encoding(encoding='utf-8', newline=None)
    self.assertEqual(wrapper.errors, 'strict')
    wrapper.seek(0)
    self.assertEqual(wrapper.read(), 'LF\nCRLF\n')

    raw_bytes = wrapper.detach().getvalue()
    self.assertEqual(raw_bytes, b'LF\nCRLF\r\n')
3336+
3337+
def test_set_encoding_newline(self):
    """Changing only the newline mode affects later reads and writes."""
    # (raw bytes, initial newline, new newline, expected first line)
    read_cases = (
        (b'CR\rEOF', '\n', None, 'CR\n'),
        (b'CR\rEOF', '\n', '', 'CR\r'),
        (b'CR\rLF\nEOF', '\r', '\n', 'CR\rLF\n'),
        (b'LF\nCR\rEOF', '\n', '\r', 'LF\nCR\r'),
        (b'CR\rCRLF\r\nEOF', '\r', '\r\n', 'CR\rCRLF\r\n'),
    )
    for payload, initial, changed, first_line in read_cases:
        reader = self.TextIOWrapper(self.BytesIO(payload), 'ascii',
                                    newline=initial)
        reader.set_encoding(newline=changed)
        self.assertEqual(reader.readline(), first_line)

    # Each write must be terminated according to the mode set just before.
    write_steps = (
        (None, 'linesep\n'),
        ('', 'LF\n'),
        ('\n', 'LF\n'),
        ('\r', 'CR\n'),
        ('\r\n', 'CRLF\n'),
    )
    writer = self.TextIOWrapper(self.BytesIO(), 'ascii', newline='\r')
    for mode, text in write_steps:
        writer.set_encoding(newline=mode)
        writer.write(text)
    expected = 'linesep' + os.linesep + 'LF\nLF\nCR\rCRLF\r\n'
    self.assertEqual(writer.detach().getvalue().decode('ascii'), expected)
3372+
32453373

32463374
class MemviewBytesIO(io.BytesIO):
32473375
'''A BytesIO object whose read method returns memoryviews

0 commit comments

Comments
 (0)