From d4decaebd98acb49088f96ac64df4478ab11ee60 Mon Sep 17 00:00:00 2001 From: Jules Bertholet Date: Sun, 3 Mar 2024 21:54:59 -0500 Subject: [PATCH 1/3] Add API to correct defective combining character sequences --- scripts/unicode.py | 115 ++++++++++++++++++++- src/correct_ccs.rs | 177 +++++++++++++++++++++++++++++++++ src/lib.rs | 30 ++++++ src/tables.rs | 63 ++++++++++++ tests/correct_defective_ccs.rs | 29 ++++++ 5 files changed, 413 insertions(+), 1 deletion(-) create mode 100644 src/correct_ccs.rs create mode 100644 tests/correct_defective_ccs.rs diff --git a/scripts/unicode.py b/scripts/unicode.py index f874f16..52b2793 100755 --- a/scripts/unicode.py +++ b/scripts/unicode.py @@ -19,6 +19,7 @@ # Since this should not require frequent updates, we just store this # out-of-line and check the tables.rs and normalization_tests.rs files into git. import collections +import re import urllib.request from itertools import batched @@ -67,6 +68,8 @@ class UnicodeData(object): def __init__(self): self._load_unicode_data() + self._load_default_ignorable_marks() + self.norm_props = self._load_norm_props() self.norm_tests = self._load_norm_tests() @@ -101,6 +104,11 @@ def _load_unicode_data(self): self.general_category_mark = [] self.general_category_public_assigned = [] + # Characters that cannot be part of a combining character sequence: + # control characters, format characters other than ZWJ and ZWNJ, + # the line and paragraph separators, and noncharacters. 
+ self.not_in_ccs = [] + assigned_start = 0; prev_char_int = -1; prev_name = ""; @@ -126,6 +134,9 @@ def _load_unicode_data(self): if category == 'M' or 'M' in expanded_categories.get(category, []): self.general_category_mark.append(char_int) + if category in ['Cc', 'Cf', 'Zl', 'Zp'] and char_int not in [0x200C, 0x200D]: + self.not_in_ccs.append(char_int) + assert category != 'Cn', "Unexpected: Unassigned codepoint in UnicodeData.txt" if category not in ['Co', 'Cs']: if char_int != prev_char_int + 1 and not is_first_and_last(prev_name, name): @@ -136,6 +147,44 @@ def _load_unicode_data(self): self.general_category_public_assigned.append((assigned_start, prev_char_int)) + # Mark noncharacters as nongraphic + for i in range(0xFDD0, 0xFDF0): + self.not_in_ccs.append(i) + for prefix in range(0, 0x11): + shifted = prefix << 16 + self.not_in_ccs.append(shifted | 0xFFFE) + self.not_in_ccs.append(shifted | 0xFFFF) + + self.not_in_ccs.sort() + + def _load_default_ignorable_marks(self): + default_ignorable_cps = set() + + single = re.compile(r"^([0-9A-F]+)\s+;\s+Default_Ignorable_Code_Point\s+") + multiple = re.compile( + r"^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+Default_Ignorable_Code_Point\s+" + ) + + for line in self._fetch("DerivedCoreProperties.txt").splitlines(): + raw_data = None # (low, high) + if match := single.match(line): + raw_data = (match.group(1), match.group(1)) + elif match := multiple.match(line): + raw_data = (match.group(1), match.group(2)) + else: + continue + low = int(raw_data[0], 16) + high = int(raw_data[1], 16) + for cp in range(low, high + 1): + default_ignorable_cps.add(cp) + + self.default_ignorable_marks = [] + for cp in self.general_category_mark: + if cp in default_ignorable_cps: + self.default_ignorable_marks.append(cp) + + self.default_ignorable_marks.sort() + def _load_cjk_compat_ideograph_variants(self): for line in self._fetch("StandardizedVariants.txt").splitlines(): strip_comments = line.split('#', 1)[0].strip() @@ -461,7 +510,7 @@ def 
gen_combining_mark(general_category_mark, out): def gen_public_assigned(general_category_public_assigned, out): # This could be done as a hash but the table is somewhat small. - out.write("#[inline]\n") + out.write("\n#[inline]\n") out.write("pub fn is_public_assigned(c: char) -> bool {\n") out.write(" match c {\n") @@ -482,6 +531,66 @@ def gen_public_assigned(general_category_public_assigned, out): out.write(" }\n") out.write("}\n") +def gen_not_in_ccs(not_in_ccs, out): + # List of codepoints to list of ranges + range_list = [] + for cp in not_in_ccs: + if len(range_list) != 0 and range_list[-1][1] == cp - 1: + range_list[-1] = (range_list[-1][0], cp) + else: + range_list.append((cp, cp)) + + out.write("\n#[inline]\n") + out.write("pub fn not_in_ccs(c: char) -> bool {\n") + out.write(" match c {\n") + + start = True + for first, last in range_list: + if start: + out.write(" ") + start = False + else: + out.write("\n | ") + if first == last: + out.write("'\\u{%s}'" % hexify(first)) + else: + out.write("'\\u{%s}'..='\\u{%s}'" % (hexify(first), hexify(last))) + out.write(" => true,\n") + + out.write(" _ => false,\n") + out.write(" }\n") + out.write("}\n") + +def gen_default_ignorable_mark(default_ignorable_marks, out): + # List of codepoints to list of ranges + range_list = [] + for cp in default_ignorable_marks: + if len(range_list) != 0 and range_list[-1][1] == cp - 1: + range_list[-1] = (range_list[-1][0], cp) + else: + range_list.append((cp, cp)) + + out.write("\n#[inline]\n") + out.write("pub fn is_default_ignorable_mark(c: char) -> bool {\n") + out.write(" match c {\n") + + start = True + for first, last in range_list: + if start: + out.write(" ") + start = False + else: + out.write("\n | ") + if first == last: + out.write("'\\u{%s}'" % hexify(first)) + else: + out.write("'\\u{%s}'..='\\u{%s}'" % (hexify(first), hexify(last))) + out.write(" => true,\n") + + out.write(" _ => false,\n") + out.write(" }\n") + out.write("}\n") + def gen_stream_safe(leading, 
trailing, out): # This could be done as a hash but the table is very small. out.write("#[inline]\n") @@ -602,6 +711,10 @@ def minimal_perfect_hash(d): gen_public_assigned(data.general_category_public_assigned, out) + gen_not_in_ccs(data.not_in_ccs, out) + + gen_default_ignorable_mark(data.default_ignorable_marks, out) + gen_nfc_qc(data.norm_props, out) gen_nfkc_qc(data.norm_props, out) diff --git a/src/correct_ccs.rs b/src/correct_ccs.rs new file mode 100644 index 0000000..7395b88 --- /dev/null +++ b/src/correct_ccs.rs @@ -0,0 +1,177 @@ +#[cfg(not(feature = "std"))] +use alloc::collections::VecDeque; +use core::iter::FusedIterator; +#[cfg(feature = "std")] +use std::collections::VecDeque; + +use crate::{lookups, tables}; + +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +enum CcsKind { + /// A CCS base character (graphic character other than combining mark). + Base, + + /// A combining character other than a `Default_Ignorable_Code_Point`. + NonIgnorableCombining, + + /// A default-ignorable combining character, ZWJ, or ZWNJ. + IgnorableCombining, +} + +impl CcsKind { + fn of(c: char) -> Option { + if c == '\u{200C}' || c == '\u{200D}' { + // ZWNJ || ZWJ + Some(CcsKind::IgnorableCombining) + } else if lookups::is_combining_mark(c) { + if tables::is_default_ignorable_mark(c) { + Some(CcsKind::IgnorableCombining) + } else { + Some(CcsKind::NonIgnorableCombining) + } + } else if tables::not_in_ccs(c) { + None + } else { + Some(CcsKind::Base) + } + } +} + +/// An iterator over the string that corrects +/// [defective combining character sequences](https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf#I6.1.36487) +/// by inserting U+00A0 NO-BREAK SPACE in front of them. +/// +/// For the purposes of this iterator, private use characters, +/// as well as unassigned codepoints other than noncharacters, +/// are considered valid base characters, +/// so combining character sequences that start with such will not be modified. 
+/// +/// In addition, combining character sequences that consist entirely of `Default_Ignorable_Code_Point`s +/// will not be modified. (Because of this, this iterator may buffer up to the entire length of its input; +/// it is *not* "stream-safe" *even if* used with [`StreamSafe`][crate::StreamSafe]). +#[derive(Clone, Debug)] +pub struct CorrectDefectiveCcs { + /// Whether the last character emitted was part of a CCS. + in_ccs: bool, + buffer: VecDeque>, + /// Whether the last character in `buffer` is part of a CCS. + /// (Updated only when `in_ccs` is set from false to true). + end_of_buffer_in_ccs: bool, + iter: I, +} + +impl> Iterator for CorrectDefectiveCcs { + type Item = char; + + fn next(&mut self) -> Option { + if self.in_ccs { + if let Some(c) = self.buffer.pop_front() { + // Empty buffer + + if self.buffer.is_empty() { + self.in_ccs = self.end_of_buffer_in_ccs; + } + c + } else { + // Forward from inner iterator + + let c = self.iter.next(); + if c.map_or(true, tables::not_in_ccs) { + self.in_ccs = false; + } + c + } + } else { + if self.buffer.is_empty() { + // We don't have a buffer of default ignorable combining characters built up + + let c = self.iter.next()?; + match CcsKind::of(c) { + // Character not in CCS, just forward it + None => return Some(c), + + // Character starts non-defective CCS, + // label ourselves as in CCS and forward it + Some(CcsKind::Base) => { + self.in_ccs = true; + return Some(c); + } + + // Character starts defective CCS and is not default-ignorable. + // Put it in the buffer to emit on next iteration, + // mark ourselves as in CCS, + // and emit NO-BREAK SPACE + Some(CcsKind::NonIgnorableCombining) => { + self.in_ccs = true; + self.end_of_buffer_in_ccs = true; + self.buffer.push_back(Some(c)); + return Some('\u{00A0}'); // NO-BREAK SPACE + } + + // Character starts defective CCS and is default-ignorable. + // Put it in the buffer, and fall through to loop below + // to find out whether we emit a NO-BREAK SPACE first. 
+ Some(CcsKind::IgnorableCombining) => { + self.buffer.push_back(Some(c)); + } + } + } + + loop { + // We do have a buffer of default ignorable combining characters built up, + // and we need to figure out whether to emit a NO-BREAK SPACE first. + + let c = self.iter.next(); + match c.and_then(CcsKind::of) { + // Inner iterator yielded character outside CCS (or `None`). + // Emit the built-up buffer with no leading NO-BREAK SPACE. + None => { + self.in_ccs = true; + self.end_of_buffer_in_ccs = false; + let ret = self.buffer.pop_front().unwrap(); + self.buffer.push_back(c); + return ret; + } + + // Inner iterator yielded character that starts a new CCS. + // Emit the built-up buffer with no leading NO-BREAK SPACE. + Some(CcsKind::Base) => { + self.in_ccs = true; + self.end_of_buffer_in_ccs = true; + let ret = self.buffer.pop_front().unwrap(); + self.buffer.push_back(c); + return ret; + } + + // Inner iterator yielded non-ignorable combining character. + // Emit the built-up buffer with leading NO-BREAK SPACE. + Some(CcsKind::NonIgnorableCombining) => { + self.in_ccs = true; + self.end_of_buffer_in_ccs = true; + self.buffer.push_back(c); + return Some('\u{00A0}'); // NO-BREAK SPACE + } + + // Inner iterator yielded ignorable combining character. + // Add it to the buffer, don't emit anything. 
+ Some(CcsKind::IgnorableCombining) => { + self.buffer.push_back(c); + } + } + } + } + } +} + +impl + FusedIterator> FusedIterator for CorrectDefectiveCcs {} + +impl CorrectDefectiveCcs { + pub(crate) fn new(iter: I) -> Self { + Self { + in_ccs: false, + buffer: VecDeque::new(), + end_of_buffer_in_ccs: false, + iter, + } + } +} diff --git a/src/lib.rs b/src/lib.rs index cc0a850..cc20865 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -52,6 +52,7 @@ extern crate core; extern crate tinyvec; +pub use crate::correct_ccs::CorrectDefectiveCcs; pub use crate::decompose::Decompositions; pub use crate::quick_check::{ is_nfc, is_nfc_quick, is_nfc_stream_safe, is_nfc_stream_safe_quick, is_nfd, is_nfd_quick, @@ -64,6 +65,7 @@ pub use crate::stream_safe::StreamSafe; pub use crate::tables::UNICODE_VERSION; use core::{option, str::Chars}; +mod correct_ccs; mod decompose; mod lookups; mod normalize; @@ -128,6 +130,19 @@ pub trait UnicodeNormalization> { /// An Iterator over the string with Conjoining Grapheme Joiner characters /// inserted according to the Stream-Safe Text Process (UAX15-D4) fn stream_safe(self) -> StreamSafe; + + /// An iterator over the string with + /// [defective combining character sequences](https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf#I6.1.36487) + /// corrected via the insertion of U+00A0 NO-BREAK SPACE. + /// + /// Sequences starting with a private use character or an unassigned codepoint that is not a noncharacter + /// are not corrected. Additionally, combining character sequences consisting entirely of + /// [default-ignorable code points](https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf#I8.1.40715) + /// are also left untouched. Handling this last case may require the iterator + /// to buffer up to the entire length of the input; + /// this iterator is therefore *not* "stream safe" + /// *even if* used in combination with [`stream_safe()`][UnicodeNormalization::stream_safe]. 
+ fn correct_defective_ccs(self) -> CorrectDefectiveCcs; } impl<'a> UnicodeNormalization> for &'a str { @@ -160,6 +175,11 @@ impl<'a> UnicodeNormalization> for &'a str { fn stream_safe(self) -> StreamSafe> { StreamSafe::new(self.chars()) } + + #[inline] + fn correct_defective_ccs(self) -> CorrectDefectiveCcs> { + CorrectDefectiveCcs::new(self.chars()) + } } impl UnicodeNormalization> for char { @@ -192,6 +212,11 @@ impl UnicodeNormalization> for char { fn stream_safe(self) -> StreamSafe> { StreamSafe::new(Some(self).into_iter()) } + + #[inline] + fn correct_defective_ccs(self) -> CorrectDefectiveCcs> { + CorrectDefectiveCcs::new(Some(self).into_iter()) + } } impl> UnicodeNormalization for I { @@ -224,4 +249,9 @@ impl> UnicodeNormalization for I { fn stream_safe(self) -> StreamSafe { StreamSafe::new(self) } + + #[inline] + fn correct_defective_ccs(self) -> CorrectDefectiveCcs { + CorrectDefectiveCcs::new(self) + } } diff --git a/src/tables.rs b/src/tables.rs index 4b6bf97..0197489 100644 --- a/src/tables.rs +++ b/src/tables.rs @@ -20242,6 +20242,7 @@ pub(crate) const COMBINING_MARK_KV: &[u32] = &[ 0xE0193, 0x11D40, 0x005B9, 0x00F7D, 0x16F5A, 0xE0197, 0x009CD, 0x00FB5, 0x1DA07, 0xE01D1, 0x0A880, 0x01A7C, 0x11CA5, 0x009CB, 0x00FB3, 0x00825, 0x1103A, 0x00827, 0x1E94A, 0x008F8, ]; + #[inline] pub fn is_public_assigned(c: char) -> bool { match c { @@ -20954,6 +20955,68 @@ pub fn is_public_assigned(c: char) -> bool { } } +#[inline] +pub fn not_in_ccs(c: char) -> bool { + match c { + '\u{0000}'..='\u{001F}' + | '\u{007F}'..='\u{009F}' + | '\u{00AD}' + | '\u{0600}'..='\u{0605}' + | '\u{061C}' + | '\u{06DD}' + | '\u{070F}' + | '\u{0890}'..='\u{0891}' + | '\u{08E2}' + | '\u{180E}' + | '\u{200B}' + | '\u{200E}'..='\u{200F}' + | '\u{2028}'..='\u{202E}' + | '\u{2060}'..='\u{2064}' + | '\u{2066}'..='\u{206F}' + | '\u{FDD0}'..='\u{FDEF}' + | '\u{FEFF}' + | '\u{FFF9}'..='\u{FFFB}' + | '\u{FFFE}'..='\u{FFFF}' + | '\u{110BD}' + | '\u{110CD}' + | '\u{13430}'..='\u{1343F}' + | 
'\u{1BCA0}'..='\u{1BCA3}' + | '\u{1D173}'..='\u{1D17A}' + | '\u{1FFFE}'..='\u{1FFFF}' + | '\u{2FFFE}'..='\u{2FFFF}' + | '\u{3FFFE}'..='\u{3FFFF}' + | '\u{4FFFE}'..='\u{4FFFF}' + | '\u{5FFFE}'..='\u{5FFFF}' + | '\u{6FFFE}'..='\u{6FFFF}' + | '\u{7FFFE}'..='\u{7FFFF}' + | '\u{8FFFE}'..='\u{8FFFF}' + | '\u{9FFFE}'..='\u{9FFFF}' + | '\u{AFFFE}'..='\u{AFFFF}' + | '\u{BFFFE}'..='\u{BFFFF}' + | '\u{CFFFE}'..='\u{CFFFF}' + | '\u{DFFFE}'..='\u{DFFFF}' + | '\u{E0001}' + | '\u{E0020}'..='\u{E007F}' + | '\u{EFFFE}'..='\u{EFFFF}' + | '\u{FFFFE}'..='\u{FFFFF}' + | '\u{10FFFE}'..='\u{10FFFF}' => true, + _ => false, + } +} + +#[inline] +pub fn is_default_ignorable_mark(c: char) -> bool { + match c { + '\u{034F}' + | '\u{17B4}'..='\u{17B5}' + | '\u{180B}'..='\u{180D}' + | '\u{180F}' + | '\u{FE00}'..='\u{FE0F}' + | '\u{E0100}'..='\u{E01EF}' => true, + _ => false, + } +} + #[inline] #[allow(ellipsis_inclusive_range_patterns)] pub fn qc_nfc(c: char) -> IsNormalized { diff --git a/tests/correct_defective_ccs.rs b/tests/correct_defective_ccs.rs new file mode 100644 index 0000000..552b712 --- /dev/null +++ b/tests/correct_defective_ccs.rs @@ -0,0 +1,29 @@ +use unicode_normalization::UnicodeNormalization; + +macro_rules! 
check_ccs { + ($input: expr, $expected_out: expr) => { + assert_eq!( + $input.correct_defective_ccs().collect::(), + $expected_out + ) + }; +} + +#[test] +fn defective_css() { + check_ccs!("", ""); + check_ccs!("abcde", "abcde"); + check_ccs!("a\u{0301}bcde", "a\u{0301}bcde"); + check_ccs!("\u{0301}bcde", "\u{00A0}\u{0301}bcde"); + check_ccs!("\u{200C}\u{0301}bcde", "\u{00A0}\u{200C}\u{0301}bcde"); + check_ccs!("\u{200C}bcde", "\u{200C}bcde"); + check_ccs!("\u{180F}bcde", "\u{180F}bcde"); + check_ccs!("\u{FFFF}\u{0301}bcde", "\u{FFFF}\u{00A0}\u{0301}bcde"); + check_ccs!("\u{10FFFD}\u{0301}bcde", "\u{10FFFD}\u{0301}bcde"); + check_ccs!("\u{180F}\u{180F}\u{180F}", "\u{180F}\u{180F}\u{180F}"); + check_ccs!("\u{180F}\u{180F}\u{180F}a", "\u{180F}\u{180F}\u{180F}a"); + check_ccs!( + "\u{180F}\u{180F}\u{180F}\u{0301}", + "\u{00A0}\u{180F}\u{180F}\u{180F}\u{0301}" + ); +} From 0b13808b20a0c3332677f7a2dfe629535d9a73e9 Mon Sep 17 00:00:00 2001 From: Jules Bertholet Date: Mon, 4 Mar 2024 21:14:54 -0500 Subject: [PATCH 2/3] Add note on `correct_defective_ccs` usage --- src/correct_ccs.rs | 2 +- src/lib.rs | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/correct_ccs.rs b/src/correct_ccs.rs index 7395b88..186cebd 100644 --- a/src/correct_ccs.rs +++ b/src/correct_ccs.rs @@ -44,7 +44,7 @@ impl CcsKind { /// For the purposes of this iterator, private use characters, /// as well as unassigned codepoints other than noncharacters, /// are considered valid base characters, -/// so combining character sequences that start with such will not be modified. +/// so combining character sequences that follow such will not be modified. /// /// In addition, combining character sequences that consist entirely of `Default_Ignorable_Code_Point`s /// will not be modified. 
(Because of this, this iterator may buffer up to the entire length of its input; diff --git a/src/lib.rs b/src/lib.rs index cc20865..8ab97bd 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -133,9 +133,13 @@ pub trait UnicodeNormalization> { /// An iterator over the string with /// [defective combining character sequences](https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf#I6.1.36487) - /// corrected via the insertion of U+00A0 NO-BREAK SPACE. + /// corrected via the insertion of U+00A0 NO-BREAK SPACE in front of them. /// - /// Sequences starting with a private use character or an unassigned codepoint that is not a noncharacter + /// This helps ensure that the sequences will be displayed correctly and consistently, + /// with the correct advance width, + /// in diverse contexts (for example, when printed to a terminal). + /// + /// Sequences following a private use character or an unassigned codepoint that is not a noncharacter /// are not corrected. Additionally, combining character sequences consisting entirely of /// [default-ignorable code points](https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf#I8.1.40715) /// are also left untouched. 
Handling this last case may require the iterator From 2dbe07f814df2c18e56701f8db839b19a9d74237 Mon Sep 17 00:00:00 2001 From: Jules Bertholet Date: Wed, 13 Mar 2024 13:47:15 -0400 Subject: [PATCH 3/3] Don't handle noncharacters differently than other unassigned codepoints --- scripts/unicode.py | 10 +--------- src/correct_ccs.rs | 3 +-- src/lib.rs | 2 +- src/tables.rs | 20 +------------------- tests/correct_defective_ccs.rs | 2 +- 5 files changed, 5 insertions(+), 32 deletions(-) diff --git a/scripts/unicode.py b/scripts/unicode.py index 52b2793..5f1e8fe 100755 --- a/scripts/unicode.py +++ b/scripts/unicode.py @@ -106,7 +106,7 @@ def _load_unicode_data(self): # Characters that cannot be part of a combining character sequence: # control characters, format characters other than ZWJ and ZWNJ, - # the line and paragraph separators, and noncharacters. + # and the line and paragraph separators. self.not_in_ccs = [] assigned_start = 0; @@ -147,14 +147,6 @@ def _load_unicode_data(self): self.general_category_public_assigned.append((assigned_start, prev_char_int)) - # Mark noncharacters as nongraphic - for i in range(0xFDD0, 0xFDF0): - self.not_in_ccs.append(i) - for prefix in range(0, 0x11): - shifted = prefix << 16 - self.not_in_ccs.append(shifted | 0xFFFE) - self.not_in_ccs.append(shifted | 0xFFFF) - self.not_in_ccs.sort() def _load_default_ignorable_marks(self): diff --git a/src/correct_ccs.rs b/src/correct_ccs.rs index 186cebd..4619cf9 100644 --- a/src/correct_ccs.rs +++ b/src/correct_ccs.rs @@ -41,8 +41,7 @@ impl CcsKind { /// [defective combining character sequences](https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf#I6.1.36487) /// by inserting U+00A0 NO-BREAK SPACE in front of them. 
/// -/// For the purposes of this iterator, private use characters, -/// as well as unassigned codepoints other than noncharacters, +/// For the purposes of this iterator, private use characters and unassigned codepoints /// are considered valid base characters, /// so combining character sequences that follow such will not be modified. /// diff --git a/src/lib.rs b/src/lib.rs index 8ab97bd..a924a46 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -139,7 +139,7 @@ pub trait UnicodeNormalization> { /// with the correct advance width, /// in diverse contexts (for example, when printed to a terminal). /// - /// Sequences following a private use character or an unassigned codepoint that is not a noncharacter + /// Sequences following a private use character or an unassigned codepoint /// are not corrected. Additionally, combining character sequences consisting entirely of /// [default-ignorable code points](https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf#I8.1.40715) /// are also left untouched. 
Handling this last case may require the iterator diff --git a/src/tables.rs b/src/tables.rs index 0197489..ac58cea 100644 --- a/src/tables.rs +++ b/src/tables.rs @@ -20973,33 +20973,15 @@ pub fn not_in_ccs(c: char) -> bool { | '\u{2028}'..='\u{202E}' | '\u{2060}'..='\u{2064}' | '\u{2066}'..='\u{206F}' - | '\u{FDD0}'..='\u{FDEF}' | '\u{FEFF}' | '\u{FFF9}'..='\u{FFFB}' - | '\u{FFFE}'..='\u{FFFF}' | '\u{110BD}' | '\u{110CD}' | '\u{13430}'..='\u{1343F}' | '\u{1BCA0}'..='\u{1BCA3}' | '\u{1D173}'..='\u{1D17A}' - | '\u{1FFFE}'..='\u{1FFFF}' - | '\u{2FFFE}'..='\u{2FFFF}' - | '\u{3FFFE}'..='\u{3FFFF}' - | '\u{4FFFE}'..='\u{4FFFF}' - | '\u{5FFFE}'..='\u{5FFFF}' - | '\u{6FFFE}'..='\u{6FFFF}' - | '\u{7FFFE}'..='\u{7FFFF}' - | '\u{8FFFE}'..='\u{8FFFF}' - | '\u{9FFFE}'..='\u{9FFFF}' - | '\u{AFFFE}'..='\u{AFFFF}' - | '\u{BFFFE}'..='\u{BFFFF}' - | '\u{CFFFE}'..='\u{CFFFF}' - | '\u{DFFFE}'..='\u{DFFFF}' | '\u{E0001}' - | '\u{E0020}'..='\u{E007F}' - | '\u{EFFFE}'..='\u{EFFFF}' - | '\u{FFFFE}'..='\u{FFFFF}' - | '\u{10FFFE}'..='\u{10FFFF}' => true, + | '\u{E0020}'..='\u{E007F}' => true, _ => false, } } diff --git a/tests/correct_defective_ccs.rs b/tests/correct_defective_ccs.rs index 552b712..e7858ac 100644 --- a/tests/correct_defective_ccs.rs +++ b/tests/correct_defective_ccs.rs @@ -18,7 +18,7 @@ fn defective_css() { check_ccs!("\u{200C}\u{0301}bcde", "\u{00A0}\u{200C}\u{0301}bcde"); check_ccs!("\u{200C}bcde", "\u{200C}bcde"); check_ccs!("\u{180F}bcde", "\u{180F}bcde"); - check_ccs!("\u{FFFF}\u{0301}bcde", "\u{FFFF}\u{00A0}\u{0301}bcde"); + check_ccs!("\u{FFFF}\u{0301}bcde", "\u{FFFF}\u{0301}bcde"); check_ccs!("\u{10FFFD}\u{0301}bcde", "\u{10FFFD}\u{0301}bcde"); check_ccs!("\u{180F}\u{180F}\u{180F}", "\u{180F}\u{180F}\u{180F}"); check_ccs!("\u{180F}\u{180F}\u{180F}a", "\u{180F}\u{180F}\u{180F}a");