ruby · aycabta · Jan 13, 2018 · Jan 1, 2018 · Jan 10, 2018
diff --git a/lib/rdoc/encoding.rb b/lib/rdoc/encoding.rb
@@ -7,6 +7,18 @@
 
 module RDoc::Encoding
 
+  HEADER_REGEXP = /^
+    (?:
+      \A\#!.*\n
+      |
+      ^\#\s+frozen[-_]string[-_]literal[=:].+\n
+      |
+      ^\#[^\n]+\b(?:en)?coding[=:]\s*(?<name>[^\s;]+).*\n
+      |
+      <\?xml[^?]*encoding=(?<quote>["'])(?<name>.*?)\k<quote>.*\n
+    )+
+  /xi # :nodoc:
+
   ##
   # Reads the contents of +filename+ and handles any encoding directives in
   # the file.
@@ -23,7 +35,8 @@ def self.read_file filename, encoding, force_transcode = false
 
     utf8 = content.sub!(/\A\xef\xbb\xbf/, '')
 
-    content = RDoc::Encoding.set_encoding content
+    enc = RDoc::Encoding.detect_encoding content
+    content = RDoc::Encoding.change_encoding content, enc if enc
 
     begin
       encoding ||= Encoding.default_external
@@ -85,29 +98,22 @@ def self.remove_frozen_string_literal string
   end
 
   ##
-  # Sets the encoding of +string+ based on the magic comment
-
-  def self.set_encoding string
-    string = remove_frozen_string_literal string
-
-    string =~ /\A(?:#!.*\n)?(.*\n)/
-
-    first_line = $1
+  # Detects the encoding of +string+ from its magic comment or XML declaration; nil if none
 
-    name = case first_line
-           when /^<\?xml[^?]*encoding=(["'])(.*?)\1/ then $2
-           when /\b(?:en)?coding[=:]\s*([^\s;]+)/i   then $1
-           else                                           return string
-           end
+  def self.detect_encoding string
+    result = HEADER_REGEXP.match string
+    name = result && result[:name]
 
-    string = string.sub first_line, ''
-
-    string = remove_frozen_string_literal string
+    name ? Encoding.find(name) : nil # Encoding.find raises ArgumentError for an unknown name
+  end
 
-    enc = Encoding.find name
-    string = RDoc::Encoding.change_encoding string, enc if enc
+  ##
+  # Removes magic comments and shebang, leaving only their newlines behind
 
-    string
+  def self.remove_magic_comment string
+    string.sub HEADER_REGEXP do |s| # NOTE(review): pattern opens with ^, not \A, so it can match below line 1 -- confirm intended
+      s.gsub(/[^\n]/, '') # blank every character of the matched header except "\n"
+    end
   end
 
   ##

diff --git a/lib/rdoc/markup/pre_process.rb b/lib/rdoc/markup/pre_process.rb
@@ -266,6 +266,7 @@ def include_file name, indent, encoding
     end
 
     content = RDoc::Encoding.read_file full_name, encoding, true
+    content = RDoc::Encoding.remove_magic_comment content
 
     # strip magic comment
     content = content.sub(/\A# .*coding[=:].*$/, '').lstrip

diff --git a/lib/rdoc/parser/ruby.rb b/lib/rdoc/parser/ruby.rb
@@ -177,6 +177,7 @@ def initialize(top_level, file_name, content, options, stats)
 
     @size = 0
     @token_listeners = nil
+    content = RDoc::Encoding.remove_magic_comment content
     @scanner = RDoc::RipperStateLex.parse(content)
     @content = content
     @scanner_point = 0

diff --git a/test/test_rdoc_encoding.rb b/test/test_rdoc_encoding.rb
@@ -31,7 +31,7 @@ def test_class_read_file_encoding
     @tempfile.flush
 
     contents = RDoc::Encoding.read_file @tempfile.path, Encoding::UTF_8
-    assert_equal "hi everybody", contents
+    assert_equal "# coding: utf-8\nhi everybody", contents
     assert_equal Encoding::UTF_8, contents.encoding
   end
 
@@ -45,7 +45,7 @@ def test_class_read_file_encoding_convert
 
     contents = RDoc::Encoding.read_file @tempfile.path, Encoding::UTF_8
     assert_equal Encoding::UTF_8, contents.encoding
-    assert_equal "hi \u00e9verybody", contents.sub("\r", '')
+    assert_equal "# coding: ISO-8859-1\nhi \u00e9verybody", contents.sub("\r", '')
   end
 
   def test_class_read_file_encoding_fail
@@ -71,7 +71,7 @@ def test_class_read_file_encoding_fancy
     @tempfile.flush
 
     contents = RDoc::Encoding.read_file @tempfile.path, Encoding::UTF_8
-    assert_equal "hi everybody", contents
+    assert_equal "# -*- coding: utf-8; fill-column: 74 -*-\nhi everybody", contents
     assert_equal Encoding::UTF_8, contents.encoding
   end
 
@@ -81,7 +81,7 @@ def test_class_read_file_encoding_force_transcode
 
     contents = RDoc::Encoding.read_file @tempfile.path, Encoding::US_ASCII, true
 
-    assert_equal '?', contents
+    assert_equal "# coding: utf-8\n?", contents
     assert_equal Encoding::US_ASCII, contents.encoding
   end
 
@@ -124,108 +124,58 @@ def test_class_read_file_encoding_iso_2022_jp
 
     contents = RDoc::Encoding.read_file @tempfile.path, Encoding::UTF_8
 
-    expected = ":\xe3\x82\xb3\xe3\x83\x9e\xe3\x83\xb3\xe3\x83\x89:"
+    expected = "# coding: ISO-2022-JP\n:\xe3\x82\xb3\xe3\x83\x9e\xe3\x83\xb3\xe3\x83\x89:"
     expected = RDoc::Encoding.change_encoding expected, Encoding::UTF_8
 
     assert_equal expected, contents
     assert_equal Encoding::UTF_8, contents.encoding
   end
 
-  def test_class_set_encoding
+  def test_class_detect_encoding
     s = "# coding: UTF-8\n"
-    s = RDoc::Encoding.set_encoding s
+    encoding = RDoc::Encoding.detect_encoding s
 
     # sanity check for 1.8
 
-    assert_equal Encoding::UTF_8, s.encoding
+    assert_equal Encoding::UTF_8, encoding
 
     s = "#!/bin/ruby\n# coding: UTF-8\n"
-    s = RDoc::Encoding.set_encoding s
+    encoding = RDoc::Encoding.detect_encoding s
 
-    assert_equal Encoding::UTF_8, s.encoding
+    assert_equal Encoding::UTF_8, encoding
 
     s = "<?xml version='1.0' encoding='UTF-8'?>\n"
-    s = RDoc::Encoding.set_encoding s
+    encoding = RDoc::Encoding.detect_encoding s
 
-    assert_equal Encoding::UTF_8, s.encoding
+    assert_equal Encoding::UTF_8, encoding
 
     s = "<?xml version='1.0' encoding=\"UTF-8\"?>\n"
-    s = RDoc::Encoding.set_encoding s
+    encoding = RDoc::Encoding.detect_encoding s
 
-    assert_equal Encoding::UTF_8, s.encoding
-  end
-
-  def test_class_set_encoding_strip
-    s = "# coding: UTF-8\n# more comments"
-
-    s = RDoc::Encoding.set_encoding s
-
-    assert_equal "# more comments", s
-
-    s = "#!/bin/ruby\n# coding: UTF-8\n# more comments"
-
-    s = RDoc::Encoding.set_encoding s
-
-    assert_equal "#!/bin/ruby\n# more comments", s
+    assert_equal Encoding::UTF_8, encoding
   end
 
   def test_class_set_encoding_bad
     s = ""
-    expected = s.encoding
-    s = RDoc::Encoding.set_encoding s
+    encoding = RDoc::Encoding.detect_encoding s
 
-    assert_equal expected, s.encoding
+    assert_nil encoding # empty input has no header to match
 
     s = "# vim:set fileencoding=utf-8:\n"
-    expected = s.encoding
-    s = RDoc::Encoding.set_encoding s
+    encoding = RDoc::Encoding.detect_encoding s
 
-    assert_equal expected, s.encoding
+    assert_nil encoding # "fileencoding" has no word boundary before "(en)coding"
 
     s = "# vim:set fileencoding=utf-8:\n"
-    expected = s.encoding
-    s = RDoc::Encoding.set_encoding s
+    encoding = RDoc::Encoding.detect_encoding s
 
-    assert_equal expected, s.encoding
+    assert_nil encoding # assert_nil: assert_equal nil is deprecated in Minitest 5
 
     assert_raises ArgumentError do
-      s = RDoc::Encoding.set_encoding "# -*- encoding: undecided -*-\n"
+      RDoc::Encoding.detect_encoding "# -*- encoding: undecided -*-\n" # unknown encoding name
     end
   end
 
-  def test_skip_frozen_string_literal
-    expected = "# frozen_string_literal: true\nhi everybody"
-
-    @tempfile.write expected
-    @tempfile.flush
-
-    contents = RDoc::Encoding.read_file @tempfile.path, Encoding::UTF_8
-    assert_equal "hi everybody", contents
-    assert_equal Encoding::UTF_8, contents.encoding
-  end
-
-  def test_skip_frozen_string_literal_after_coding
-    expected = "# coding: utf-8\n# frozen-string-literal: false\nhi everybody"
-
-    @tempfile.write expected
-    @tempfile.flush
-
-    contents = RDoc::Encoding.read_file @tempfile.path, Encoding::UTF_8
-    assert_equal "hi everybody", contents
-    assert_equal Encoding::UTF_8, contents.encoding
-  end
-
-  def test_skip_frozen_string_literal_before_coding
-    expected = "# frozen_string_literal: true\n# coding: utf-8\nhi everybody"
-
-    @tempfile.write expected
-    @tempfile.flush
-
-    contents = RDoc::Encoding.read_file @tempfile.path, Encoding::UTF_8
-    assert_equal "hi everybody", contents
-    assert_equal Encoding::UTF_8, contents.encoding
-  end
-
   def test_sanity
     assert_equal Encoding::US_ASCII, ''.encoding,
                  'If this file is not ASCII tests may incorrectly pass'