From 8bd6e3a2d66e32fc1cb3a351cbc3b6a6a0535734 Mon Sep 17 00:00:00 2001 From: Basile Henry Date: Sun, 7 Mar 2021 19:18:37 +0100 Subject: [PATCH] Add unicode_word_indices The iterator UnicodeWordIndices is similar to UnicodeWord but also provides byte offsets for each word --- src/lib.rs | 31 ++++++++++++++++++++++++++++++- src/word.rs | 46 ++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 72 insertions(+), 5 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index b0ed2d1..ed74f8f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -66,7 +66,7 @@ extern crate quickcheck; pub use grapheme::{Graphemes, GraphemeIndices}; pub use grapheme::{GraphemeCursor, GraphemeIncomplete}; pub use tables::UNICODE_VERSION; -pub use word::{UWordBounds, UWordBoundIndices, UnicodeWords}; +pub use word::{UWordBounds, UWordBoundIndices, UnicodeWords, UnicodeWordIndices}; pub use sentence::{USentenceBounds, USentenceBoundIndices, UnicodeSentences}; mod grapheme; @@ -146,6 +146,30 @@ pub trait UnicodeSegmentation { /// ``` fn unicode_words<'a>(&'a self) -> UnicodeWords<'a>; + /// Returns an iterator over the words of `self`, separated on + /// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries), and their + /// offsets. + /// + /// Here, "words" are just those substrings which, after splitting on + /// UAX#29 word boundaries, contain any alphanumeric characters. That is, the + /// substring must contain at least one character with the + /// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic) + /// property, or with + /// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values). 
+ /// + /// # Example + /// + /// ``` + /// # use self::unicode_segmentation::UnicodeSegmentation; + /// let uwis = "The quick (\"brown\") fox can't jump 32.3 feet, right?"; + /// let uwi1 = uwis.unicode_word_indices().collect::<Vec<(usize, &str)>>(); + /// let b: &[_] = &[(0, "The"), (4, "quick"), (12, "brown"), (20, "fox"), (24, "can't"), + /// (30, "jump"), (35, "32.3"), (40, "feet"), (46, "right")]; + /// + /// assert_eq!(&uwi1[..], b); + /// ``` + fn unicode_word_indices<'a>(&'a self) -> UnicodeWordIndices<'a>; + /// Returns an iterator over substrings of `self` separated on /// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries). /// @@ -249,6 +273,11 @@ impl UnicodeSegmentation for str { word::new_unicode_words(self) } + #[inline] + fn unicode_word_indices(&self) -> UnicodeWordIndices { + word::new_unicode_word_indices(self) + } + #[inline] fn split_word_bounds(&self) -> UWordBounds { word::new_word_bounds(self) diff --git a/src/word.rs b/src/word.rs index 179d122..b9bd956 100644 --- a/src/word.rs +++ b/src/word.rs @@ -40,6 +40,34 @@ impl<'a> DoubleEndedIterator for UnicodeWords<'a> { fn next_back(&mut self) -> Option<&'a str> { self.inner.next_back() } } +/// An iterator over the substrings of a string which, after splitting the string on +/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries), +/// contain any characters with the +/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic) +/// property, or with +/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values). +/// This iterator also provides the byte offsets for each substring. +/// +/// This struct is created by the [`unicode_word_indices`] method on the [`UnicodeSegmentation`] trait. See +/// its documentation for more. 
+/// +/// [`unicode_word_indices`]: trait.UnicodeSegmentation.html#tymethod.unicode_word_indices +/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html +pub struct UnicodeWordIndices<'a> { + inner: Filter<UWordBoundIndices<'a>, fn(&(usize, &str)) -> bool>, +} + +impl<'a> Iterator for UnicodeWordIndices<'a> { + type Item = (usize, &'a str); + + #[inline] + fn next(&mut self) -> Option<(usize, &'a str)> { self.inner.next() } +} +impl<'a> DoubleEndedIterator for UnicodeWordIndices<'a> { + #[inline] + fn next_back(&mut self) -> Option<(usize, &'a str)> { self.inner.next_back() } +} + /// External iterator for a string's /// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries). /// @@ -671,12 +699,22 @@ pub fn new_word_bound_indices<'b>(s: &'b str) -> UWordBoundIndices<'b> { } #[inline] -pub fn new_unicode_words<'b>(s: &'b str) -> UnicodeWords<'b> { - use super::UnicodeSegmentation; +fn has_alphanumeric(s: &&str) -> bool { use tables::util::is_alphanumeric; - fn has_alphanumeric(s: &&str) -> bool { s.chars().any(|c| is_alphanumeric(c)) } - let has_alphanumeric: fn(&&str) -> bool = has_alphanumeric; // coerce to fn pointer + s.chars().any(|c| is_alphanumeric(c)) +} + +#[inline] +pub fn new_unicode_words<'b>(s: &'b str) -> UnicodeWords<'b> { + use super::UnicodeSegmentation; UnicodeWords { inner: s.split_word_bounds().filter(has_alphanumeric) } } + +#[inline] +pub fn new_unicode_word_indices<'b>(s: &'b str) -> UnicodeWordIndices<'b> { + use super::UnicodeSegmentation; + + UnicodeWordIndices { inner: s.split_word_bound_indices().filter(|(_, c)| has_alphanumeric(c)) } +}