rust-lang · huonw · Feb 23, 2014 · Feb 23, 2014
diff --git a/src/libsyntax/codemap.rs b/src/libsyntax/codemap.rs
@@ -460,11 +460,12 @@ impl CodeMap {
         for mbc in multibyte_chars.get().iter() {
             debug!("codemap: {:?}-byte char at {:?}", mbc.bytes, mbc.pos);
             if mbc.pos < bpos {
-                total_extra_bytes += mbc.bytes;
+                // every character is at least one byte, so we only
+                // count the actual extra bytes.
+                total_extra_bytes += mbc.bytes - 1;
                 // We should never see a byte position in the middle of a
                 // character
-                assert!(bpos == mbc.pos ||
-                        bpos.to_uint() >= mbc.pos.to_uint() + mbc.bytes);
+                assert!(bpos.to_uint() >= mbc.pos.to_uint() + mbc.bytes);
             } else {
                 break;
             }

diff --git a/src/libsyntax/parse/lexer.rs b/src/libsyntax/parse/lexer.rs
@@ -264,8 +264,7 @@ pub fn bump(rdr: &StringReader) {
         }
 
         if byte_offset_diff > 1 {
-            rdr.filemap.record_multibyte_char(
-                Pos::from_uint(current_byte_offset), byte_offset_diff);
+            rdr.filemap.record_multibyte_char(rdr.last_pos.get(), byte_offset_diff);
         }
     } else {
         rdr.curr.set(None);

diff --git a/src/test/run-make/unicode-input/Makefile b/src/test/run-make/unicode-input/Makefile
@@ -0,0 +1,11 @@
+-include ../tools.mk
+
+all:
+	# check that we don't ICE on unicode input, issue #11178
+	$(RUSTC) multiple_files.rs
+	$(call RUN,multiple_files)  "$(RUSTC)" "$(TMPDIR)"
+
+	# check that our multibyte-ident spans are (approximately) the
+	# correct length. issue #8706
+	$(RUSTC) span_length.rs
+	$(call RUN,span_length) "$(RUSTC)" "$(TMPDIR)"
diff --git a/src/test/run-make/unicode-input/multiple_files.rs b/src/test/run-make/unicode-input/multiple_files.rs
@@ -0,0 +1,65 @@
+// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
+// file at the top-level directory of this distribution and at
+// http://rust-lang.org/COPYRIGHT.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+use std::{char, os, run, str};
+use std::rand::{task_rng, Rng};
+use std::io::File;
+
+// creates unicode_input_multiple_files_{main,chars}.rs, where the
+// former imports the latter. `_chars` just contains an identifier
+// made up of random characters, because the compiler will emit an
+// error message about the ident being in the wrong place, with a span
+// (and creating this span used to upset the compiler).
+
+fn random_char() -> char {
+    let mut rng = task_rng();
+    // a subset of the XID_start unicode table (ensuring that the
+    // compiler doesn't fail with an "unrecognised token" error)
+    let (lo, hi): (u32, u32) = match rng.gen_range(1, 4 + 1) {
+        1 => (0x41, 0x5a),
+        2 => (0xf8, 0x1ba),
+        3 => (0x1401, 0x166c),
+        _ => (0x10400, 0x1044f)
+    };
+
+    char::from_u32(rng.gen_range(lo, hi + 1)).unwrap()
+}
+
+fn main() {
+    let args = os::args();
+    let rustc = args[1].as_slice();
+    let tmpdir = Path::new(args[2].as_slice());
+
+    let main_file = tmpdir.join("unicode_input_multiple_files_main.rs");
+    let main_file_str = main_file.as_str().unwrap();
+    {
+        let _ = File::create(&main_file).unwrap()
+            .write_str("mod unicode_input_multiple_files_chars;");
+    }
+
+    for _ in range(0, 100) {
+        {
+            let randoms = tmpdir.join("unicode_input_multiple_files_chars.rs");
+            let mut w = File::create(&randoms).unwrap();
+            for _ in range(0, 30) {
+                let _ = w.write_char(random_char());
+            }
+        }
+
+        // rustc is passed to us with --out-dir and -L etc., so we
+        // can't exec it directly
+        let result = run::process_output("sh", [~"-c", rustc + " " + main_file_str]).unwrap();
+        let err = str::from_utf8_lossy(result.error);
+
+        // positive test so that this test will be updated when the
+        // compiler changes.
+        assert!(err.as_slice().contains("expected item but found"))
+    }
+}
diff --git a/src/test/run-make/unicode-input/span_length.rs b/src/test/run-make/unicode-input/span_length.rs
@@ -0,0 +1,62 @@
+// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
+// file at the top-level directory of this distribution and at
+// http://rust-lang.org/COPYRIGHT.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+use std::{char, os, run, str};
+use std::rand::{task_rng, Rng};
+use std::io::File;
+
+// creates a file with `fn main() { <random ident> }` and checks the
+// compiler emits a span of the appropriate length (for the
+// "unresolved name" message); currently just using the number of code
+// points, but should be the number of graphemes (FIXME #7043)
+
+fn random_char() -> char {
+    let mut rng = task_rng();
+    // a subset of the XID_start unicode table (ensuring that the
+    // compiler doesn't fail with an "unrecognised token" error)
+    let (lo, hi): (u32, u32) = match rng.gen_range(1, 4 + 1) {
+        1 => (0x41, 0x5a),
+        2 => (0xf8, 0x1ba),
+        3 => (0x1401, 0x166c),
+        _ => (0x10400, 0x1044f)
+    };
+
+    char::from_u32(rng.gen_range(lo, hi + 1)).unwrap()
+}
+
+fn main() {
+    let args = os::args();
+    let rustc = args[1].as_slice();
+    let tmpdir = Path::new(args[2].as_slice());
+
+    let main_file = tmpdir.join("span_main.rs");
+    let main_file_str = main_file.as_str().unwrap();
+
+    for _ in range(0, 100) {
+        let n = task_rng().gen_range(3u, 20);
+
+        {
+            let _ = write!(&mut File::create(&main_file).unwrap(),
+                           r"\#[feature(non_ascii_idents)]; fn main() \{ {} \}",
+                           // random string of length n
+                           range(0, n).map(|_| random_char()).collect::<~str>());
+        }
+
+        // rustc is passed to us with --out-dir and -L etc., so we
+        // can't exec it directly
+        let result = run::process_output("sh", [~"-c", rustc + " " + main_file_str]).unwrap();
+
+        let err = str::from_utf8_lossy(result.error);
+
+        // the span should end the line (e.g. no extra ~'s)
+        let expected_span = "^" + "~".repeat(n - 1) + "\n";
+        assert!(err.as_slice().contains(expected_span));
+    }
+}