Skip to content

Handle multibyte characters in source files better #12489

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions src/libsyntax/codemap.rs
Original file line number Diff line number Diff line change
Expand Up @@ -460,11 +460,12 @@ impl CodeMap {
for mbc in multibyte_chars.get().iter() {
debug!("codemap: {:?}-byte char at {:?}", mbc.bytes, mbc.pos);
if mbc.pos < bpos {
total_extra_bytes += mbc.bytes;
// every character is at least one byte, so we only
// count the actual extra bytes.
total_extra_bytes += mbc.bytes - 1;
// We should never see a byte position in the middle of a
// character
assert!(bpos == mbc.pos ||
bpos.to_uint() >= mbc.pos.to_uint() + mbc.bytes);
assert!(bpos.to_uint() >= mbc.pos.to_uint() + mbc.bytes);
} else {
break;
}
Expand Down
3 changes: 1 addition & 2 deletions src/libsyntax/parse/lexer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -264,8 +264,7 @@ pub fn bump(rdr: &StringReader) {
}

if byte_offset_diff > 1 {
rdr.filemap.record_multibyte_char(
Pos::from_uint(current_byte_offset), byte_offset_diff);
rdr.filemap.record_multibyte_char(rdr.last_pos.get(), byte_offset_diff);
}
} else {
rdr.curr.set(None);
Expand Down
11 changes: 11 additions & 0 deletions src/test/run-make/unicode-input/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Builds and runs the two unicode-input regression programs. Each one is
# compiled with $(RUSTC) and then executed via $(call RUN,...) with the
# rustc command line and $(TMPDIR) passed as its two arguments.
# NOTE(review): RUSTC, RUN and TMPDIR are presumably defined by
# ../tools.mk -- confirm there.
-include ../tools.mk

all:
# check that we don't ICE on unicode input, issue #11178
$(RUSTC) multiple_files.rs
$(call RUN,multiple_files) "$(RUSTC)" "$(TMPDIR)"

# check that our multibyte-ident spans are (approximately) the
# correct length. issue #8706
$(RUSTC) span_length.rs
$(call RUN,span_length) "$(RUSTC)" "$(TMPDIR)"
65 changes: 65 additions & 0 deletions src/test/run-make/unicode-input/multiple_files.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

use std::{char, os, run, str};
use std::rand::{task_rng, Rng};
use std::io::File;

// creates unicode_input_multiple_files_{main,chars}.rs, where the
// former imports the latter. `_chars` just contains an identifier
// made up of random characters, because the compiler will emit an
// error message about the ident being in the wrong place, with a span
// (and creating this span used to upset the compiler).

// Returns one randomly chosen character taken from one of four
// hard-coded code point ranges.
fn random_char() -> char {
let mut rng = task_rng();
// a subset of the XID_start unicode table (ensuring that the
// compiler doesn't fail with an "unrecognised token" error)
//
// The random selector picks which (lo, hi) pair is used; the `_`
// arm covers whatever selector value is not 1, 2 or 3.
let (lo, hi): (u32, u32) = match rng.gen_range(1, 4 + 1) {
1 => (0x41, 0x5a),
2 => (0xf8, 0x1ba),
3 => (0x1401, 0x166c),
_ => (0x10400, 0x1044f)
};

// `hi + 1` is passed as the upper bound, presumably because
// `gen_range` excludes its upper bound (TODO confirm), so that `hi`
// itself can be produced. The `unwrap` assumes every value in the
// ranges above is a valid `char`.
char::from_u32(rng.gen_range(lo, hi + 1)).unwrap()
}

// Test driver: writes a main file that declares the module
// `unicode_input_multiple_files_chars`, then 100 times regenerates that
// module's file with random characters, runs the compiler on the main
// file, and asserts on the compiler's error output.
//
// Arguments: args[1] is the rustc command line, args[2] is the
// directory the generated files are written to.
fn main() {
let args = os::args();
let rustc = args[1].as_slice();
let tmpdir = Path::new(args[2].as_slice());

let main_file = tmpdir.join("unicode_input_multiple_files_main.rs");
let main_file_str = main_file.as_str().unwrap();
// The main file is written once, before the loop. Failure to create
// it panics via `unwrap`; the result of the write itself is discarded.
{
let _ = File::create(&main_file).unwrap()
.write_str("mod unicode_input_multiple_files_chars;");
}

for _ in range(0, 100) {
// Rewrite the chars module with 30 fresh random characters. The
// writer lives only inside this inner block, presumably so the
// file is closed before the compiler is run (TODO confirm).
// Individual write results are discarded.
{
let randoms = tmpdir.join("unicode_input_multiple_files_chars.rs");
let mut w = File::create(&randoms).unwrap();
for _ in range(0, 30) {
let _ = w.write_char(random_char());
}
}

// rustc is passed to us with --out-dir and -L etc., so we
// can't exec it directly
let result = run::process_output("sh", [~"-c", rustc + " " + main_file_str]).unwrap();
// The captured error output is decoded lossily before it is searched.
let err = str::from_utf8_lossy(result.error);

// positive test so that this test will be updated when the
// compiler changes.
assert!(err.as_slice().contains("expected item but found"))
}
}
62 changes: 62 additions & 0 deletions src/test/run-make/unicode-input/span_length.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

use std::{char, os, run, str};
use std::rand::{task_rng, Rng};
use std::io::File;

// creates a file with `fn main() { <random ident> }` and checks the
// compiler emits a span of the appropriate length (for the
// "unresolved name" message); currently just using the number of code
// points, but should be the number of graphemes (FIXME #7043)

// Returns one randomly chosen character taken from one of four
// hard-coded code point ranges.
fn random_char() -> char {
let mut rng = task_rng();
// a subset of the XID_start unicode table (ensuring that the
// compiler doesn't fail with an "unrecognised token" error)
//
// The random selector picks which (lo, hi) pair is used; the `_`
// arm covers whatever selector value is not 1, 2 or 3.
let (lo, hi): (u32, u32) = match rng.gen_range(1, 4 + 1) {
1 => (0x41, 0x5a),
2 => (0xf8, 0x1ba),
3 => (0x1401, 0x166c),
_ => (0x10400, 0x1044f)
};

// `hi + 1` is passed as the upper bound, presumably because
// `gen_range` excludes its upper bound (TODO confirm), so that `hi`
// itself can be produced. The `unwrap` assumes every value in the
// ranges above is a valid `char`.
char::from_u32(rng.gen_range(lo, hi + 1)).unwrap()
}

// Test driver: 100 times, writes a program whose `main` body is a
// single random identifier, runs the compiler on it, and asserts that
// the error output contains a span marker of the expected length.
//
// Arguments: args[1] is the rustc command line, args[2] is the
// directory the generated file is written to.
fn main() {
let args = os::args();
let rustc = args[1].as_slice();
let tmpdir = Path::new(args[2].as_slice());

let main_file = tmpdir.join("span_main.rs");
let main_file_str = main_file.as_str().unwrap();

for _ in range(0, 100) {
// Number of characters in this iteration's identifier.
let n = task_rng().gen_range(3u, 20);

// (Re)create the source file. The file handle is a temporary
// inside this block, and the result of the write is discarded.
// NOTE(review): the backslashes before `#`, `{` and `}` look like
// escapes for the format-string syntax of the time -- confirm.
{
let _ = write!(&mut File::create(&main_file).unwrap(),
r"\#[feature(non_ascii_idents)]; fn main() \{ {} \}",
// random string of length n
range(0, n).map(|_| random_char()).collect::<~str>());
}

// rustc is passed to us with --out-dir and -L etc., so we
// can't exec it directly
let result = run::process_output("sh", [~"-c", rustc + " " + main_file_str]).unwrap();

// The captured error output is decoded lossily before it is searched.
let err = str::from_utf8_lossy(result.error);

// the span should end the line (e.g. no extra ~'s)
// Expected marker: one `^`, then `n - 1` `~` characters, then a
// newline, i.e. one marker character per character of the ident.
let expected_span = "^" + "~".repeat(n - 1) + "\n";
assert!(err.as_slice().contains(expected_span));
}
}