diff --git a/lib/rdoc/encoding.rb b/lib/rdoc/encoding.rb index 54ecd89816..dac4c36f57 100644 --- a/lib/rdoc/encoding.rb +++ b/lib/rdoc/encoding.rb @@ -7,6 +7,18 @@ module RDoc::Encoding + HEADER_REGEXP = /^ + (?: + \A\#!.*\n + | + ^\#\s+frozen[-_]string[-_]literal[=:].+\n + | + ^\#[^\n]+\b(?:en)?coding[=:]\s*(?<name>[^\s;]+).*\n + | + <\?xml[^?]*encoding=(?<quote>["'])(?<name>.*?)\k<quote>.*\n + )+ + /xi # :nodoc: + ## # Reads the contents of +filename+ and handles any encoding directives in # the file. @@ -23,7 +35,8 @@ def self.read_file filename, encoding, force_transcode = false utf8 = content.sub!(/\A\xef\xbb\xbf/, '') - content = RDoc::Encoding.set_encoding content + enc = RDoc::Encoding.detect_encoding content + content = RDoc::Encoding.change_encoding content, enc if enc begin encoding ||= Encoding.default_external @@ -85,29 +98,22 @@ def self.remove_frozen_string_literal string end ## - # Sets the encoding of +string+ based on the magic comment - - def self.set_encoding string - string = remove_frozen_string_literal string - - string =~ /\A(?:#!.*\n)?(.*\n)/ - - first_line = $1 + # Detects the encoding of +string+ based on the magic comment - name = case first_line - when /^<\?xml[^?]*encoding=(["'])(.*?)\1/ then $2 - when /\b(?:en)?coding[=:]\s*([^\s;]+)/i then $1 - else return string - end + def self.detect_encoding string + result = HEADER_REGEXP.match string + name = result && result[:name] - string = string.sub first_line, '' - - string = remove_frozen_string_literal string + name ? 
Encoding.find(name) : nil + end - enc = Encoding.find name - string = RDoc::Encoding.change_encoding string, enc if enc + ## + # Removes magic comments and shebang - string + def self.remove_magic_comment string + string.sub HEADER_REGEXP do |s| + s.gsub(/[^\n]/, '') + end end ## diff --git a/lib/rdoc/markup/pre_process.rb b/lib/rdoc/markup/pre_process.rb index 0ac7a41934..d9e0dcac14 100644 --- a/lib/rdoc/markup/pre_process.rb +++ b/lib/rdoc/markup/pre_process.rb @@ -266,6 +266,7 @@ def include_file name, indent, encoding end content = RDoc::Encoding.read_file full_name, encoding, true + content = RDoc::Encoding.remove_magic_comment content # strip magic comment content = content.sub(/\A# .*coding[=:].*$/, '').lstrip diff --git a/lib/rdoc/parser/ruby.rb b/lib/rdoc/parser/ruby.rb index 8599f655ad..183156baf4 100644 --- a/lib/rdoc/parser/ruby.rb +++ b/lib/rdoc/parser/ruby.rb @@ -177,6 +177,7 @@ def initialize(top_level, file_name, content, options, stats) @size = 0 @token_listeners = nil + content = RDoc::Encoding.remove_magic_comment content @scanner = RDoc::RipperStateLex.parse(content) @content = content @scanner_point = 0 diff --git a/test/test_rdoc_encoding.rb b/test/test_rdoc_encoding.rb index 9dc20990a0..572cfcda47 100644 --- a/test/test_rdoc_encoding.rb +++ b/test/test_rdoc_encoding.rb @@ -31,7 +31,7 @@ def test_class_read_file_encoding @tempfile.flush contents = RDoc::Encoding.read_file @tempfile.path, Encoding::UTF_8 - assert_equal "hi everybody", contents + assert_equal "# coding: utf-8\nhi everybody", contents assert_equal Encoding::UTF_8, contents.encoding end @@ -45,7 +45,7 @@ def test_class_read_file_encoding_convert contents = RDoc::Encoding.read_file @tempfile.path, Encoding::UTF_8 assert_equal Encoding::UTF_8, contents.encoding - assert_equal "hi \u00e9verybody", contents.sub("\r", '') + assert_equal "# coding: ISO-8859-1\nhi \u00e9verybody", contents.sub("\r", '') end def test_class_read_file_encoding_fail @@ -71,7 +71,7 @@ def 
test_class_read_file_encoding_fancy @tempfile.flush contents = RDoc::Encoding.read_file @tempfile.path, Encoding::UTF_8 - assert_equal "hi everybody", contents + assert_equal "# -*- coding: utf-8; fill-column: 74 -*-\nhi everybody", contents assert_equal Encoding::UTF_8, contents.encoding end @@ -81,7 +81,7 @@ def test_class_read_file_encoding_force_transcode contents = RDoc::Encoding.read_file @tempfile.path, Encoding::US_ASCII, true - assert_equal '?', contents + assert_equal "# coding: utf-8\n?", contents assert_equal Encoding::US_ASCII, contents.encoding end @@ -124,108 +124,58 @@ def test_class_read_file_encoding_iso_2022_jp contents = RDoc::Encoding.read_file @tempfile.path, Encoding::UTF_8 - expected = ":\xe3\x82\xb3\xe3\x83\x9e\xe3\x83\xb3\xe3\x83\x89:" + expected = "# coding: ISO-2022-JP\n:\xe3\x82\xb3\xe3\x83\x9e\xe3\x83\xb3\xe3\x83\x89:" expected = RDoc::Encoding.change_encoding expected, Encoding::UTF_8 assert_equal expected, contents assert_equal Encoding::UTF_8, contents.encoding end - def test_class_set_encoding + def test_class_detect_encoding s = "# coding: UTF-8\n" - s = RDoc::Encoding.set_encoding s + encoding = RDoc::Encoding.detect_encoding s # sanity check for 1.8 - assert_equal Encoding::UTF_8, s.encoding + assert_equal Encoding::UTF_8, encoding s = "#!/bin/ruby\n# coding: UTF-8\n" - s = RDoc::Encoding.set_encoding s + encoding = RDoc::Encoding.detect_encoding s - assert_equal Encoding::UTF_8, s.encoding + assert_equal Encoding::UTF_8, encoding s = "<?xml version='1.0' encoding='UTF-8'?>\n" - s = RDoc::Encoding.set_encoding s + encoding = RDoc::Encoding.detect_encoding s - assert_equal Encoding::UTF_8, s.encoding + assert_equal Encoding::UTF_8, encoding s = "<?xml version='1.0' encoding=\"UTF-8\"?>\n" - s = RDoc::Encoding.set_encoding s + encoding = RDoc::Encoding.detect_encoding s - assert_equal Encoding::UTF_8, s.encoding - end - - def test_class_set_encoding_strip - s = "# coding: UTF-8\n# more comments" - - s = RDoc::Encoding.set_encoding s - - assert_equal "# more comments", s - - s = "#!/bin/ruby\n# coding: 
UTF-8\n# more comments" - - s = RDoc::Encoding.set_encoding s - - assert_equal "#!/bin/ruby\n# more comments", s + assert_equal Encoding::UTF_8, encoding end def test_class_set_encoding_bad s = "" - expected = s.encoding - s = RDoc::Encoding.set_encoding s + encoding = RDoc::Encoding.detect_encoding s - assert_equal expected, s.encoding + assert_equal nil, encoding s = "# vim:set fileencoding=utf-8:\n" - expected = s.encoding - s = RDoc::Encoding.set_encoding s + encoding = RDoc::Encoding.detect_encoding s - assert_equal expected, s.encoding + assert_equal nil, encoding s = "# vim:set fileencoding=utf-8:\n" - expected = s.encoding - s = RDoc::Encoding.set_encoding s + encoding = RDoc::Encoding.detect_encoding s - assert_equal expected, s.encoding + assert_equal nil, encoding assert_raises ArgumentError do - s = RDoc::Encoding.set_encoding "# -*- encoding: undecided -*-\n" + s = RDoc::Encoding.detect_encoding "# -*- encoding: undecided -*-\n" end end - def test_skip_frozen_string_literal - expected = "# frozen_string_literal: true\nhi everybody" - - @tempfile.write expected - @tempfile.flush - - contents = RDoc::Encoding.read_file @tempfile.path, Encoding::UTF_8 - assert_equal "hi everybody", contents - assert_equal Encoding::UTF_8, contents.encoding - end - - def test_skip_frozen_string_literal_after_coding - expected = "# coding: utf-8\n# frozen-string-literal: false\nhi everybody" - - @tempfile.write expected - @tempfile.flush - - contents = RDoc::Encoding.read_file @tempfile.path, Encoding::UTF_8 - assert_equal "hi everybody", contents - assert_equal Encoding::UTF_8, contents.encoding - end - - def test_skip_frozen_string_literal_before_coding - expected = "# frozen_string_literal: true\n# coding: utf-8\nhi everybody" - - @tempfile.write expected - @tempfile.flush - - contents = RDoc::Encoding.read_file @tempfile.path, Encoding::UTF_8 - assert_equal "hi everybody", contents - assert_equal Encoding::UTF_8, contents.encoding - end - def test_sanity assert_equal 
Encoding::US_ASCII, ''.encoding, 'If this file is not ASCII tests may incorrectly pass'