diff --git a/lib/ClangImporter/SortedCFDatabase.def.gyb b/lib/ClangImporter/SortedCFDatabase.def.gyb
index 0cfa84e9c9f05..73d74ca26fdbd 100644
--- a/lib/ClangImporter/SortedCFDatabase.def.gyb
+++ b/lib/ClangImporter/SortedCFDatabase.def.gyb
@@ -17,6 +17,8 @@
 %{
 import re
+import sys
+import codecs

 prologueLines = ""
 epilogueLines = ""
@@ -26,7 +28,7 @@ epilogueLines = ""
 lineForName = {}

 # Load the data file.
-with open(CFDatabaseFile, 'rb') as f:
+with codecs.open(CFDatabaseFile, encoding=sys.getfilesystemencoding(), errors='strict') as f:
   for line in f:
     # Pass through preprocessor directives literally.
     # Assume that they all fall into either a strict prologue or epilogue.
diff --git a/utils/GYBUnicodeDataUtils.py b/utils/GYBUnicodeDataUtils.py
index a4f76c65cc7a6..338185af5c8f0 100644
--- a/utils/GYBUnicodeDataUtils.py
+++ b/utils/GYBUnicodeDataUtils.py
@@ -11,6 +11,8 @@
 ##===----------------------------------------------------------------------===##

 import re
+import sys
+import codecs

 class UnicodeProperty(object):
     """Abstract base class for Unicode properties."""
@@ -64,11 +66,11 @@ def __init__(self, grapheme_break_property_file_name):
         # values to symbolic values.
         self.symbolic_values = \
             [ None ] * (max(self.numeric_value_table.values()) + 1)
-        for k,v in self.numeric_value_table.iteritems():
+        for k,v in self.numeric_value_table.items():
             self.symbolic_values[v] = k

         # Load the data file.
-        with open(grapheme_break_property_file_name, 'rb') as f:
+        with codecs.open(grapheme_break_property_file_name, encoding=sys.getfilesystemencoding(), errors='strict') as f:
             for line in f:
                 # Strip comments.
                 line = re.sub('#.*', '', line)
@@ -329,7 +331,10 @@ def map_index(idx):
             else:
                 return idx

-        return map(map_index, indexes)
+        # NOTE: Python 2's `map` function returns a list, whereas Python 3's
+        # `map` function returns an iterator. To work around this, the
+        # result of the `map` is explicitly converted to a `list`.
+        return list(map(map_index, indexes))

         # If self.BMP_data contains identical data blocks, keep the first one,
         # remove duplicates and change the indexes in self.BMP_lookup to point to
@@ -514,9 +519,9 @@ def _convert_line(line):

             # Match a list of code points.
             for token in line.split(" "):
-                if token == "÷":
+                if token == u"÷":
                     boundaries += [ curr_bytes ]
-                elif token == "×":
+                elif token == u"×":
                     pass
                 else:
                     code_point = int(token, 16)
@@ -529,21 +534,21 @@ def _convert_line(line):
                     # and test separately that we handle ill-formed UTF-8 sequences.
                     if code_point >= 0xd800 and code_point <= 0xdfff:
                         code_point = 0x200b
-                    code_point = ('\U%(cp)08x' % { 'cp': code_point }).decode('unicode_escape')
-                    as_UTF8_bytes = code_point.encode('utf8')
-                    as_UTF8_escaped = ''.join(['\\x%(byte)02x' % { 'byte': ord(byte) } for byte in as_UTF8_bytes])
+                    code_point = (b'\U%(cp)08x' % { b'cp': code_point }).decode('unicode_escape', 'strict')
+                    as_UTF8_bytes = bytearray(code_point.encode('utf8', 'strict'))
+                    as_UTF8_escaped = ''.join(['\\x%(byte)02x' % { 'byte': byte } for byte in as_UTF8_bytes])
                     test += as_UTF8_escaped
                     curr_bytes += len(as_UTF8_bytes)

             return (test, boundaries)

         # Self-test.
-        assert(_convert_line('÷ 0903 × 0308 ÷ AC01 ÷ # abc') == ('\\xe0\\xa4\\x83\\xcc\\x88\\xea\\xb0\\x81', [ 0, 5, 8 ]))
-        assert(_convert_line('÷ D800 ÷ # abc') == ('\\xe2\\x80\\x8b', [ 0, 3 ]))
+        assert(_convert_line(u'÷ 0903 × 0308 ÷ AC01 ÷ # abc') == ('\\xe0\\xa4\\x83\\xcc\\x88\\xea\\xb0\\x81', [ 0, 5, 8 ]))
+        assert(_convert_line(u'÷ D800 ÷ # abc') == ('\\xe2\\x80\\x8b', [ 0, 3 ]))

         result = []
-        with open(grapheme_break_test_file_name, 'rb') as f:
+        with codecs.open(grapheme_break_test_file_name, encoding=sys.getfilesystemencoding(), errors='strict') as f:
             for line in f:
                 test = _convert_line(line)
                 if test:
diff --git a/utils/gyb.py b/utils/gyb.py
index 0d123175c4c48..ba0d76ce1f1a6 100755
--- a/utils/gyb.py
+++ b/utils/gyb.py
@@ -5,7 +5,10 @@
 from __future__ import print_function

 import re
-from cStringIO import StringIO
+try:
+    from cStringIO import StringIO
+except ImportError:
+    from io import StringIO
 import tokenize
 import textwrap
 from bisect import bisect
@@ -135,7 +138,8 @@ def tokenizePythonToUnmatchedCloseCurly(sourceText, start, lineStarts):
             if nesting < 0:
                 return tokenPosToIndex(tokenStart, start, lineStarts)

-    except tokenize.TokenError, (message, errorPos):
+    except tokenize.TokenError as error:
+        (message, errorPos) = error.args
         return tokenPosToIndex(errorPos, start, lineStarts)

     return len(sourceText)
@@ -304,7 +308,7 @@ def splitGybLines(sourceLines):
     dedents = 0
     try:
         for tokenKind, tokenText, tokenStart, (tokenEndLine, tokenEndCol), lineText \
-                in tokenize.generate_tokens(sourceLines.__iter__().next):
+                in tokenize.generate_tokens(lambda i = iter(sourceLines): next(i)):

             if tokenKind in (tokenize.COMMENT, tokenize.ENDMARKER):
                 continue
@@ -324,7 +328,7 @@ def splitGybLines(sourceLines):

             lastTokenText,lastTokenKind = tokenText,tokenKind

-    except tokenize.TokenError, (message, errorPos):
+    except tokenize.TokenError:
         return [] # Let the later compile() call report the error

     if lastTokenText == ':':
@@ -347,7 +351,7 @@ def codeStartsWithDedentKeyword(sourceLines):
     """
     tokenText = None
     for tokenKind, tokenText, _, _, _ \
-            in tokenize.generate_tokens(sourceLines.__iter__().next):
+            in tokenize.generate_tokens(lambda i = iter(sourceLines): next(i)):

         if tokenKind != tokenize.COMMENT and tokenText.strip() != '':
             break
diff --git a/utils/line-directive b/utils/line-directive
index 7f147f815063d..2abcbbda6d42f 100755
--- a/utils/line-directive
+++ b/utils/line-directive
@@ -71,7 +71,10 @@ def run():
     sources = sys.argv[1:dashes]

     command = subprocess.Popen(
-        sys.argv[dashes + 1:],
-        stderr = subprocess.STDOUT,
-        stdout = subprocess.PIPE
+        sys.argv[dashes + 1:],
+        stderr = subprocess.STDOUT,
+        stdout = subprocess.PIPE,
+        universal_newlines = True
     )

     error_pattern = re.compile(
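
Note for anyone applying the same Python 2/3 migration to other scripts: the sketch below collects the compatible idioms this patch relies on in one self-contained file. It is a minimal illustration, not part of the patch; the function names (read_text_lines, invert_table, scaled, token_texts) are placeholders chosen for the example.

import io
import sys
import codecs
import tokenize

def read_text_lines(path):
    # codecs.open() yields decoded text on both Python 2 and 3; a plain
    # open(path, 'rb') would hand back raw bytes on Python 3 and break
    # the str-based parsing that follows.
    with codecs.open(path, encoding=sys.getfilesystemencoding(),
                     errors='strict') as f:
        return [line.rstrip('\n') for line in f]

def invert_table(table):
    # dict.items() exists on both versions; iteritems() is Python 2 only.
    return {v: k for k, v in table.items()}

def scaled(values, factor):
    # Python 3's map() returns a lazy iterator, so materialize it with
    # list() wherever the caller needs len(), indexing, or reuse.
    return list(map(lambda x: x * factor, values))

def token_texts(source):
    # 'except E as e' replaces the Python-2-only 'except E, e' syntax;
    # the (message, position) payload now comes from e.args.
    # io.StringIO expects unicode on Python 2, so pass u'' literals there.
    try:
        return [tok[1] for tok in
                tokenize.generate_tokens(io.StringIO(source).readline)]
    except tokenize.TokenError as error:
        (message, error_pos) = error.args
        return []

The line-directive change follows the same theme: universal_newlines = True makes Popen hand back text rather than bytes on Python 3, so the existing regex matching over the child process output keeps working unchanged.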