From d4decaebd98acb49088f96ac64df4478ab11ee60 Mon Sep 17 00:00:00 2001
From: Jules Bertholet <julesbertholet@quoi.xyz>
Date: Sun, 3 Mar 2024 21:54:59 -0500
Subject: [PATCH 1/3] Add API to correct defective combining character
 sequences

---
 scripts/unicode.py             | 115 ++++++++++++++++++++-
 src/correct_ccs.rs             | 177 +++++++++++++++++++++++++++++++++
 src/lib.rs                     |  30 ++++++
 src/tables.rs                  |  63 ++++++++++++
 tests/correct_defective_ccs.rs |  29 ++++++
 5 files changed, 413 insertions(+), 1 deletion(-)
 create mode 100644 src/correct_ccs.rs
 create mode 100644 tests/correct_defective_ccs.rs

diff --git a/scripts/unicode.py b/scripts/unicode.py
index f874f16..52b2793 100755
--- a/scripts/unicode.py
+++ b/scripts/unicode.py
@@ -19,6 +19,7 @@
 # Since this should not require frequent updates, we just store this
 # out-of-line and check the tables.rs and normalization_tests.rs files into git.
 import collections
+import re
 import urllib.request
 from itertools import batched
 
@@ -67,6 +68,8 @@
 class UnicodeData(object):
     def __init__(self):
         self._load_unicode_data()
+        self._load_default_ignorable_marks()
+
         self.norm_props = self._load_norm_props()
         self.norm_tests = self._load_norm_tests()
 
@@ -101,6 +104,11 @@ def _load_unicode_data(self):
         self.general_category_mark = []
         self.general_category_public_assigned = []
 
+        # Characters that cannot be part of a combining character sequence:
+        # control characters, format characters other than ZWJ and ZWNJ,
+        # the line and paragraph separators, and noncharacters.
+        self.not_in_ccs = []
+
         assigned_start = 0;
         prev_char_int = -1;
         prev_name = "";
@@ -126,6 +134,9 @@ def _load_unicode_data(self):
             if category == 'M' or 'M' in expanded_categories.get(category, []):
                 self.general_category_mark.append(char_int)
 
+            if category in ['Cc', 'Cf', 'Zl', 'Zp'] and char_int not in [0x200C, 0x200D]:
+                self.not_in_ccs.append(char_int)
+
             assert category != 'Cn', "Unexpected: Unassigned codepoint in UnicodeData.txt"
             if category not in ['Co', 'Cs']:
                 if char_int != prev_char_int + 1 and not is_first_and_last(prev_name, name):
@@ -136,6 +147,44 @@ def _load_unicode_data(self):
 
         self.general_category_public_assigned.append((assigned_start, prev_char_int))
 
+        # Mark noncharacters as nongraphic
+        for i in range(0xFDD0, 0xFDF0):
+            self.not_in_ccs.append(i)
+        for prefix in range(0, 0x11):
+            shifted = prefix << 16
+            self.not_in_ccs.append(shifted | 0xFFFE)
+            self.not_in_ccs.append(shifted | 0xFFFF)
+
+        self.not_in_ccs.sort()
+
+    def _load_default_ignorable_marks(self):
+        default_ignorable_cps = set()
+
+        single = re.compile(r"^([0-9A-F]+)\s+;\s+Default_Ignorable_Code_Point\s+")
+        multiple = re.compile(
+            r"^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+Default_Ignorable_Code_Point\s+"
+        )
+
+        for line in self._fetch("DerivedCoreProperties.txt").splitlines():
+            raw_data = None  # (low, high)
+            if match := single.match(line):
+                raw_data = (match.group(1), match.group(1))
+            elif match := multiple.match(line):
+                raw_data = (match.group(1), match.group(2))
+            else:
+                continue
+            low = int(raw_data[0], 16)
+            high = int(raw_data[1], 16)
+            for cp in range(low, high + 1):
+                default_ignorable_cps.add(cp)
+
+        self.default_ignorable_marks = []
+        for cp in self.general_category_mark:
+            if cp in default_ignorable_cps:
+                self.default_ignorable_marks.append(cp)
+
+        self.default_ignorable_marks.sort()
+
     def _load_cjk_compat_ideograph_variants(self):
         for line in self._fetch("StandardizedVariants.txt").splitlines():
             strip_comments = line.split('#', 1)[0].strip()
@@ -461,7 +510,7 @@ def gen_combining_mark(general_category_mark, out):
 
 def gen_public_assigned(general_category_public_assigned, out):
     # This could be done as a hash but the table is somewhat small.
-    out.write("#[inline]\n")
+    out.write("\n#[inline]\n")
     out.write("pub fn is_public_assigned(c: char) -> bool {\n")
     out.write("    match c {\n")
 
@@ -482,6 +531,66 @@ def gen_public_assigned(general_category_public_assigned, out):
     out.write("    }\n")
     out.write("}\n")
 
+def gen_not_in_ccs(not_in_ccs, out):
+    # Collapse the sorted code points into a list of inclusive (first, last) ranges
+    range_list = []
+    for cp in not_in_ccs:
+        if len(range_list) != 0 and range_list[-1][1] == cp - 1:
+            range_list[-1] = (range_list[-1][0], cp)
+        else:
+            range_list.append((cp, cp))
+
+    out.write("\n#[inline]\n")
+    out.write("pub fn not_in_ccs(c: char) -> bool {\n")
+    out.write("    match c {\n")
+
+    start = True
+    for first, last in range_list:
+        if start:
+            out.write("        ")
+            start = False
+        else:
+            out.write("\n        | ")
+        if first == last:
+            out.write("'\\u{%s}'" % hexify(first))
+        else:
+            out.write("'\\u{%s}'..='\\u{%s}'" % (hexify(first), hexify(last)))
+    out.write(" => true,\n")
+
+    out.write("        _ => false,\n")
+    out.write("    }\n")
+    out.write("}\n")
+
+def gen_default_ignorable_mark(default_ignorable_marks, out):
+    # Collapse the sorted code points into a list of inclusive (first, last) ranges
+    range_list = []
+    for cp in default_ignorable_marks:
+        if len(range_list) != 0 and range_list[-1][1] == cp - 1:
+            range_list[-1] = (range_list[-1][0], cp)
+        else:
+            range_list.append((cp, cp))
+
+    out.write("\n#[inline]\n")
+    out.write("pub fn is_default_ignorable_mark(c: char) -> bool {\n")
+    out.write("    match c {\n")
+
+    start = True
+    for first, last in range_list:
+        if start:
+            out.write("        ")
+            start = False
+        else:
+            out.write("\n        | ")
+        if first == last:
+            out.write("'\\u{%s}'" % hexify(first))
+        else:
+            out.write("'\\u{%s}'..='\\u{%s}'" % (hexify(first), hexify(last)))
+    out.write(" => true,\n")
+
+    out.write("        _ => false,\n")
+    out.write("    }\n")
+    out.write("}\n")
+
 def gen_stream_safe(leading, trailing, out):
     # This could be done as a hash but the table is very small.
     out.write("#[inline]\n")
@@ -602,6 +711,10 @@ def minimal_perfect_hash(d):
 
         gen_public_assigned(data.general_category_public_assigned, out)
 
+        gen_not_in_ccs(data.not_in_ccs, out)
+
+        gen_default_ignorable_mark(data.default_ignorable_marks, out)
+
         gen_nfc_qc(data.norm_props, out)
 
         gen_nfkc_qc(data.norm_props, out)
diff --git a/src/correct_ccs.rs b/src/correct_ccs.rs
new file mode 100644
index 0000000..7395b88
--- /dev/null
+++ b/src/correct_ccs.rs
@@ -0,0 +1,177 @@
+#[cfg(not(feature = "std"))]
+use alloc::collections::VecDeque;
+use core::iter::FusedIterator;
+#[cfg(feature = "std")]
+use std::collections::VecDeque;
+
+use crate::{lookups, tables};
+
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+enum CcsKind {
+    /// A CCS base character (graphic character other than combining mark).
+    Base,
+
+    /// A combining character other than a `Default_Ignorable_Code_Point`.
+    NonIgnorableCombining,
+
+    /// A default-ignorable combining character, ZWJ, or ZWNJ.
+    IgnorableCombining,
+}
+
+impl CcsKind {
+    fn of(c: char) -> Option<Self> {
+        if c == '\u{200C}' || c == '\u{200D}' {
+            // ZWNJ || ZWJ
+            Some(CcsKind::IgnorableCombining)
+        } else if lookups::is_combining_mark(c) {
+            if tables::is_default_ignorable_mark(c) {
+                Some(CcsKind::IgnorableCombining)
+            } else {
+                Some(CcsKind::NonIgnorableCombining)
+            }
+        } else if tables::not_in_ccs(c) {
+            None
+        } else {
+            Some(CcsKind::Base)
+        }
+    }
+}
+
+/// An iterator over the string that corrects
+/// [defective combining character sequences](https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf#I6.1.36487)
+/// by inserting U+00A0 NO-BREAK SPACE in front of them.
+///
+/// For the purposes of this iterator, private use characters,
+/// as well as unassigned codepoints other than noncharacters,
+/// are considered valid base characters,
+/// so combining character sequences that start with such will not be modified.
+///
+/// In addition, combining character sequences that consist entirely of `Default_Ignorable_Code_Point`s
+/// will not be modified. (Because of this, this iterator may buffer up to the entire length of its input;
+/// it is *not* "stream-safe" *even if* used with [`StreamSafe`][crate::StreamSafe]).
+#[derive(Clone, Debug)]
+pub struct CorrectDefectiveCcs<I> {
+    /// Whether the last character emitted was part of a CCS.
+    in_ccs: bool,
+    buffer: VecDeque<Option<char>>,
+    /// Whether the last character in `buffer` is part of a CCS.
+    /// (Updated only when `in_ccs` is set from false to true).
+    end_of_buffer_in_ccs: bool,
+    iter: I,
+}
+
+impl<I: Iterator<Item = char>> Iterator for CorrectDefectiveCcs<I> {
+    type Item = char;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        if self.in_ccs {
+            if let Some(c) = self.buffer.pop_front() {
+                // Drain the buffer: emit the next buffered item
+
+                if self.buffer.is_empty() {
+                    self.in_ccs = self.end_of_buffer_in_ccs;
+                }
+                c
+            } else {
+                // Forward from inner iterator
+
+                let c = self.iter.next();
+                if c.map_or(true, tables::not_in_ccs) {
+                    self.in_ccs = false;
+                }
+                c
+            }
+        } else {
+            if self.buffer.is_empty() {
+                // We don't have a buffer of default ignorable combining characters built up
+
+                let c = self.iter.next()?;
+                match CcsKind::of(c) {
+                    // Character not in CCS, just forward it
+                    None => return Some(c),
+
+                    // Character starts non-defective CCS,
+                    // label ourselves as in CCS and forward it
+                    Some(CcsKind::Base) => {
+                        self.in_ccs = true;
+                        return Some(c);
+                    }
+
+                    // Character starts defective CCS and is not default-ignorable.
+                    // Put it in the buffer to emit on next iteration,
+                    // mark ourselves as in CCS,
+                    // and emit NO-BREAK SPACE
+                    Some(CcsKind::NonIgnorableCombining) => {
+                        self.in_ccs = true;
+                        self.end_of_buffer_in_ccs = true;
+                        self.buffer.push_back(Some(c));
+                        return Some('\u{00A0}'); // NO-BREAK SPACE
+                    }
+
+                    // Character starts defective CCS and is default-ignorable.
+                    // Put it in the buffer, and fall through to loop below
+                    // to find out whether we emit a NO-BREAK SPACE first.
+                    Some(CcsKind::IgnorableCombining) => {
+                        self.buffer.push_back(Some(c));
+                    }
+                }
+            }
+
+            loop {
+                // We do have a buffer of default ignorable combining characters built up,
+                // and we need to figure out whether to emit a NO-BREAK SPACE first.
+
+                let c = self.iter.next();
+                match c.and_then(CcsKind::of) {
+                    // Inner iterator yielded character outside CCS (or `None`).
+                    // Emit the built-up buffer with no leading NO-BREAK SPACE.
+                    None => {
+                        self.in_ccs = true;
+                        self.end_of_buffer_in_ccs = false;
+                        let ret = self.buffer.pop_front().unwrap();
+                        self.buffer.push_back(c);
+                        return ret;
+                    }
+
+                    // Inner iterator yielded character that starts a new CCS.
+                    // Emit the built-up buffer with no leading NO-BREAK SPACE.
+                    Some(CcsKind::Base) => {
+                        self.in_ccs = true;
+                        self.end_of_buffer_in_ccs = true;
+                        let ret = self.buffer.pop_front().unwrap();
+                        self.buffer.push_back(c);
+                        return ret;
+                    }
+
+                    // Inner iterator yielded non-ignorable combining character.
+                    // Emit the built-up buffer with leading NO-BREAK SPACE.
+                    Some(CcsKind::NonIgnorableCombining) => {
+                        self.in_ccs = true;
+                        self.end_of_buffer_in_ccs = true;
+                        self.buffer.push_back(c);
+                        return Some('\u{00A0}'); // NO-BREAK SPACE
+                    }
+
+                    // Inner iterator yielded ignorable combining character.
+                    // Add it to the buffer, don't emit anything.
+                    Some(CcsKind::IgnorableCombining) => {
+                        self.buffer.push_back(c);
+                    }
+                }
+            }
+        }
+    }
+}
+
+impl<I: Iterator<Item = char> + FusedIterator> FusedIterator for CorrectDefectiveCcs<I> {}
+
+impl<I> CorrectDefectiveCcs<I> {
+    pub(crate) fn new(iter: I) -> Self {
+        Self {
+            in_ccs: false,
+            buffer: VecDeque::new(),
+            end_of_buffer_in_ccs: false,
+            iter,
+        }
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
index cc0a850..cc20865 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -52,6 +52,7 @@ extern crate core;
 
 extern crate tinyvec;
 
+pub use crate::correct_ccs::CorrectDefectiveCcs;
 pub use crate::decompose::Decompositions;
 pub use crate::quick_check::{
     is_nfc, is_nfc_quick, is_nfc_stream_safe, is_nfc_stream_safe_quick, is_nfd, is_nfd_quick,
@@ -64,6 +65,7 @@ pub use crate::stream_safe::StreamSafe;
 pub use crate::tables::UNICODE_VERSION;
 use core::{option, str::Chars};
 
+mod correct_ccs;
 mod decompose;
 mod lookups;
 mod normalize;
@@ -128,6 +130,19 @@ pub trait UnicodeNormalization<I: Iterator<Item = char>> {
     /// An Iterator over the string with Conjoining Grapheme Joiner characters
     /// inserted according to the Stream-Safe Text Process (UAX15-D4)
     fn stream_safe(self) -> StreamSafe<I>;
+
+    /// An iterator over the string with
+    /// [defective combining character sequences](https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf#I6.1.36487)
+    /// corrected via the insertion of U+00A0 NO-BREAK SPACE.
+    ///
+    /// Sequences starting with a private use character or an unassigned codepoint that is not a noncharacter
+    /// are not corrected. Additionally, combining character sequences consisting entirely of
+    /// [default-ignorable code points](https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf#I8.1.40715)
+    /// are also left untouched. Handling this last case may require the iterator
+    /// to buffer up to the entire length of the input;
+    /// this iterator is therefore *not* "stream safe"
+    /// *even if* used in combination with [`stream_safe()`][UnicodeNormalization::stream_safe].
+    fn correct_defective_ccs(self) -> CorrectDefectiveCcs<I>;
 }
 
 impl<'a> UnicodeNormalization<Chars<'a>> for &'a str {
@@ -160,6 +175,11 @@ impl<'a> UnicodeNormalization<Chars<'a>> for &'a str {
     fn stream_safe(self) -> StreamSafe<Chars<'a>> {
         StreamSafe::new(self.chars())
     }
+
+    #[inline]
+    fn correct_defective_ccs(self) -> CorrectDefectiveCcs<Chars<'a>> {
+        CorrectDefectiveCcs::new(self.chars())
+    }
 }
 
 impl UnicodeNormalization<option::IntoIter<char>> for char {
@@ -192,6 +212,11 @@ impl UnicodeNormalization<option::IntoIter<char>> for char {
     fn stream_safe(self) -> StreamSafe<option::IntoIter<char>> {
         StreamSafe::new(Some(self).into_iter())
     }
+
+    #[inline]
+    fn correct_defective_ccs(self) -> CorrectDefectiveCcs<option::IntoIter<char>> {
+        CorrectDefectiveCcs::new(Some(self).into_iter())
+    }
 }
 
 impl<I: Iterator<Item = char>> UnicodeNormalization<I> for I {
@@ -224,4 +249,9 @@ impl<I: Iterator<Item = char>> UnicodeNormalization<I> for I {
     fn stream_safe(self) -> StreamSafe<I> {
         StreamSafe::new(self)
     }
+
+    #[inline]
+    fn correct_defective_ccs(self) -> CorrectDefectiveCcs<I> {
+        CorrectDefectiveCcs::new(self)
+    }
 }
diff --git a/src/tables.rs b/src/tables.rs
index 4b6bf97..0197489 100644
--- a/src/tables.rs
+++ b/src/tables.rs
@@ -20242,6 +20242,7 @@ pub(crate) const COMBINING_MARK_KV: &[u32] = &[
     0xE0193, 0x11D40, 0x005B9, 0x00F7D, 0x16F5A, 0xE0197, 0x009CD, 0x00FB5, 0x1DA07, 0xE01D1,
     0x0A880, 0x01A7C, 0x11CA5, 0x009CB, 0x00FB3, 0x00825, 0x1103A, 0x00827, 0x1E94A, 0x008F8,
 ];
+
 #[inline]
 pub fn is_public_assigned(c: char) -> bool {
     match c {
@@ -20954,6 +20955,68 @@ pub fn is_public_assigned(c: char) -> bool {
     }
 }
 
+#[inline]
+pub fn not_in_ccs(c: char) -> bool {
+    match c {
+        '\u{0000}'..='\u{001F}'
+        | '\u{007F}'..='\u{009F}'
+        | '\u{00AD}'
+        | '\u{0600}'..='\u{0605}'
+        | '\u{061C}'
+        | '\u{06DD}'
+        | '\u{070F}'
+        | '\u{0890}'..='\u{0891}'
+        | '\u{08E2}'
+        | '\u{180E}'
+        | '\u{200B}'
+        | '\u{200E}'..='\u{200F}'
+        | '\u{2028}'..='\u{202E}'
+        | '\u{2060}'..='\u{2064}'
+        | '\u{2066}'..='\u{206F}'
+        | '\u{FDD0}'..='\u{FDEF}'
+        | '\u{FEFF}'
+        | '\u{FFF9}'..='\u{FFFB}'
+        | '\u{FFFE}'..='\u{FFFF}'
+        | '\u{110BD}'
+        | '\u{110CD}'
+        | '\u{13430}'..='\u{1343F}'
+        | '\u{1BCA0}'..='\u{1BCA3}'
+        | '\u{1D173}'..='\u{1D17A}'
+        | '\u{1FFFE}'..='\u{1FFFF}'
+        | '\u{2FFFE}'..='\u{2FFFF}'
+        | '\u{3FFFE}'..='\u{3FFFF}'
+        | '\u{4FFFE}'..='\u{4FFFF}'
+        | '\u{5FFFE}'..='\u{5FFFF}'
+        | '\u{6FFFE}'..='\u{6FFFF}'
+        | '\u{7FFFE}'..='\u{7FFFF}'
+        | '\u{8FFFE}'..='\u{8FFFF}'
+        | '\u{9FFFE}'..='\u{9FFFF}'
+        | '\u{AFFFE}'..='\u{AFFFF}'
+        | '\u{BFFFE}'..='\u{BFFFF}'
+        | '\u{CFFFE}'..='\u{CFFFF}'
+        | '\u{DFFFE}'..='\u{DFFFF}'
+        | '\u{E0001}'
+        | '\u{E0020}'..='\u{E007F}'
+        | '\u{EFFFE}'..='\u{EFFFF}'
+        | '\u{FFFFE}'..='\u{FFFFF}'
+        | '\u{10FFFE}'..='\u{10FFFF}' => true,
+        _ => false,
+    }
+}
+
+#[inline]
+pub fn is_default_ignorable_mark(c: char) -> bool {
+    match c {
+        '\u{034F}'
+        | '\u{17B4}'..='\u{17B5}'
+        | '\u{180B}'..='\u{180D}'
+        | '\u{180F}'
+        | '\u{FE00}'..='\u{FE0F}'
+        | '\u{E0100}'..='\u{E01EF}' => true,
+        _ => false,
+    }
+}
+
 #[inline]
 #[allow(ellipsis_inclusive_range_patterns)]
 pub fn qc_nfc(c: char) -> IsNormalized {
diff --git a/tests/correct_defective_ccs.rs b/tests/correct_defective_ccs.rs
new file mode 100644
index 0000000..552b712
--- /dev/null
+++ b/tests/correct_defective_ccs.rs
@@ -0,0 +1,29 @@
+use unicode_normalization::UnicodeNormalization;
+
+macro_rules! check_ccs {
+    ($input: expr, $expected_out: expr) => {
+        assert_eq!(
+            $input.correct_defective_ccs().collect::<String>(),
+            $expected_out
+        )
+    };
+}
+
+#[test]
+fn defective_css() {
+    check_ccs!("", "");
+    check_ccs!("abcde", "abcde");
+    check_ccs!("a\u{0301}bcde", "a\u{0301}bcde");
+    check_ccs!("\u{0301}bcde", "\u{00A0}\u{0301}bcde");
+    check_ccs!("\u{200C}\u{0301}bcde", "\u{00A0}\u{200C}\u{0301}bcde");
+    check_ccs!("\u{200C}bcde", "\u{200C}bcde");
+    check_ccs!("\u{180F}bcde", "\u{180F}bcde");
+    check_ccs!("\u{FFFF}\u{0301}bcde", "\u{FFFF}\u{00A0}\u{0301}bcde");
+    check_ccs!("\u{10FFFD}\u{0301}bcde", "\u{10FFFD}\u{0301}bcde");
+    check_ccs!("\u{180F}\u{180F}\u{180F}", "\u{180F}\u{180F}\u{180F}");
+    check_ccs!("\u{180F}\u{180F}\u{180F}a", "\u{180F}\u{180F}\u{180F}a");
+    check_ccs!(
+        "\u{180F}\u{180F}\u{180F}\u{0301}",
+        "\u{00A0}\u{180F}\u{180F}\u{180F}\u{0301}"
+    );
+}

From 0b13808b20a0c3332677f7a2dfe629535d9a73e9 Mon Sep 17 00:00:00 2001
From: Jules Bertholet <julesbertholet@quoi.xyz>
Date: Mon, 4 Mar 2024 21:14:54 -0500
Subject: [PATCH 2/3] Add note on `correct_defective_ccs` usage

---
 src/correct_ccs.rs | 2 +-
 src/lib.rs         | 8 ++++++--
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/src/correct_ccs.rs b/src/correct_ccs.rs
index 7395b88..186cebd 100644
--- a/src/correct_ccs.rs
+++ b/src/correct_ccs.rs
@@ -44,7 +44,7 @@ impl CcsKind {
 /// For the purposes of this iterator, private use characters,
 /// as well as unassigned codepoints other than noncharacters,
 /// are considered valid base characters,
-/// so combining character sequences that start with such will not be modified.
+/// so combining character sequences that follow such will not be modified.
 ///
 /// In addition, combining character sequences that consist entirely of `Default_Ignorable_Code_Point`s
 /// will not be modified. (Because of this, this iterator may buffer up to the entire length of its input;
diff --git a/src/lib.rs b/src/lib.rs
index cc20865..8ab97bd 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -133,9 +133,13 @@ pub trait UnicodeNormalization<I: Iterator<Item = char>> {
 
     /// An iterator over the string with
     /// [defective combining character sequences](https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf#I6.1.36487)
-    /// corrected via the insertion of U+00A0 NO-BREAK SPACE.
+    /// corrected via the insertion of U+00A0 NO-BREAK SPACE in front of them.
     ///
-    /// Sequences starting with a private use character or an unassigned codepoint that is not a noncharacter
+    /// This helps ensure that the sequences will be displayed correctly and consistently,
+    /// with the correct advance width,
+    /// in diverse contexts (for example, when printed to a terminal).
+    ///
+    /// Sequences following a private use character or an unassigned codepoint that is not a noncharacter
     /// are not corrected. Additionally, combining character sequences consisting entirely of
     /// [default-ignorable code points](https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf#I8.1.40715)
     /// are also left untouched. Handling this last case may require the iterator

From 2dbe07f814df2c18e56701f8db839b19a9d74237 Mon Sep 17 00:00:00 2001
From: Jules Bertholet <julesbertholet@quoi.xyz>
Date: Wed, 13 Mar 2024 13:47:15 -0400
Subject: [PATCH 3/3] Don't handle noncharacters differently than other
 unassigned codepoints

---
 scripts/unicode.py             | 10 +---------
 src/correct_ccs.rs             |  3 +--
 src/lib.rs                     |  2 +-
 src/tables.rs                  | 20 +-------------------
 tests/correct_defective_ccs.rs |  2 +-
 5 files changed, 5 insertions(+), 32 deletions(-)

diff --git a/scripts/unicode.py b/scripts/unicode.py
index 52b2793..5f1e8fe 100755
--- a/scripts/unicode.py
+++ b/scripts/unicode.py
@@ -106,7 +106,7 @@ def _load_unicode_data(self):
 
         # Characters that cannot be part of a combining character sequence:
         # control characters, format characters other than ZWJ and ZWNJ,
-        # the line and paragraph separators, and noncharacters.
+        # and the line and paragraph separators.
         self.not_in_ccs = []
 
         assigned_start = 0;
@@ -147,14 +147,6 @@ def _load_unicode_data(self):
 
         self.general_category_public_assigned.append((assigned_start, prev_char_int))
 
-        # Mark noncharacters as nongraphic
-        for i in range(0xFDD0, 0xFDF0):
-            self.not_in_ccs.append(i)
-        for prefix in range(0, 0x11):
-            shifted = prefix << 16
-            self.not_in_ccs.append(shifted | 0xFFFE)
-            self.not_in_ccs.append(shifted | 0xFFFF)
-
         self.not_in_ccs.sort()
 
     def _load_default_ignorable_marks(self):
diff --git a/src/correct_ccs.rs b/src/correct_ccs.rs
index 186cebd..4619cf9 100644
--- a/src/correct_ccs.rs
+++ b/src/correct_ccs.rs
@@ -41,8 +41,7 @@ impl CcsKind {
 /// [defective combining character sequences](https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf#I6.1.36487)
 /// by inserting U+00A0 NO-BREAK SPACE in front of them.
 ///
-/// For the purposes of this iterator, private use characters,
-/// as well as unassigned codepoints other than noncharacters,
+/// For the purposes of this iterator, private use characters and unassigned codepoints
 /// are considered valid base characters,
 /// so combining character sequences that follow such will not be modified.
 ///
diff --git a/src/lib.rs b/src/lib.rs
index 8ab97bd..a924a46 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -139,7 +139,7 @@ pub trait UnicodeNormalization<I: Iterator<Item = char>> {
     /// with the correct advance width,
     /// in diverse contexts (for example, when printed to a terminal).
     ///
-    /// Sequences following a private use character or an unassigned codepoint that is not a noncharacter
+    /// Sequences following a private use character or an unassigned codepoint
     /// are not corrected. Additionally, combining character sequences consisting entirely of
     /// [default-ignorable code points](https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf#I8.1.40715)
     /// are also left untouched. Handling this last case may require the iterator
diff --git a/src/tables.rs b/src/tables.rs
index 0197489..ac58cea 100644
--- a/src/tables.rs
+++ b/src/tables.rs
@@ -20973,33 +20973,15 @@ pub fn not_in_ccs(c: char) -> bool {
         | '\u{2028}'..='\u{202E}'
         | '\u{2060}'..='\u{2064}'
         | '\u{2066}'..='\u{206F}'
-        | '\u{FDD0}'..='\u{FDEF}'
         | '\u{FEFF}'
         | '\u{FFF9}'..='\u{FFFB}'
-        | '\u{FFFE}'..='\u{FFFF}'
         | '\u{110BD}'
         | '\u{110CD}'
         | '\u{13430}'..='\u{1343F}'
         | '\u{1BCA0}'..='\u{1BCA3}'
         | '\u{1D173}'..='\u{1D17A}'
-        | '\u{1FFFE}'..='\u{1FFFF}'
-        | '\u{2FFFE}'..='\u{2FFFF}'
-        | '\u{3FFFE}'..='\u{3FFFF}'
-        | '\u{4FFFE}'..='\u{4FFFF}'
-        | '\u{5FFFE}'..='\u{5FFFF}'
-        | '\u{6FFFE}'..='\u{6FFFF}'
-        | '\u{7FFFE}'..='\u{7FFFF}'
-        | '\u{8FFFE}'..='\u{8FFFF}'
-        | '\u{9FFFE}'..='\u{9FFFF}'
-        | '\u{AFFFE}'..='\u{AFFFF}'
-        | '\u{BFFFE}'..='\u{BFFFF}'
-        | '\u{CFFFE}'..='\u{CFFFF}'
-        | '\u{DFFFE}'..='\u{DFFFF}'
         | '\u{E0001}'
-        | '\u{E0020}'..='\u{E007F}'
-        | '\u{EFFFE}'..='\u{EFFFF}'
-        | '\u{FFFFE}'..='\u{FFFFF}'
-        | '\u{10FFFE}'..='\u{10FFFF}' => true,
+        | '\u{E0020}'..='\u{E007F}' => true,
         _ => false,
     }
 }
diff --git a/tests/correct_defective_ccs.rs b/tests/correct_defective_ccs.rs
index 552b712..e7858ac 100644
--- a/tests/correct_defective_ccs.rs
+++ b/tests/correct_defective_ccs.rs
@@ -18,7 +18,7 @@ fn defective_css() {
     check_ccs!("\u{200C}\u{0301}bcde", "\u{00A0}\u{200C}\u{0301}bcde");
     check_ccs!("\u{200C}bcde", "\u{200C}bcde");
     check_ccs!("\u{180F}bcde", "\u{180F}bcde");
-    check_ccs!("\u{FFFF}\u{0301}bcde", "\u{FFFF}\u{00A0}\u{0301}bcde");
+    check_ccs!("\u{FFFF}\u{0301}bcde", "\u{FFFF}\u{0301}bcde");
     check_ccs!("\u{10FFFD}\u{0301}bcde", "\u{10FFFD}\u{0301}bcde");
     check_ccs!("\u{180F}\u{180F}\u{180F}", "\u{180F}\u{180F}\u{180F}");
     check_ccs!("\u{180F}\u{180F}\u{180F}a", "\u{180F}\u{180F}\u{180F}a");