Skip to content

BTreeSet intersection, is_subset & difference optimizations #64820

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Oct 2, 2019
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
230 changes: 155 additions & 75 deletions src/liballoc/collections/btree/set.rs
Original file line number Diff line number Diff line change
@@ -122,13 +122,16 @@ pub struct Difference<'a, T: 'a> {
}
enum DifferenceInner<'a, T: 'a> {
Stitch {
// iterate all of self and some of other, spotting matches along the way
self_iter: Iter<'a, T>,
other_iter: Peekable<Iter<'a, T>>,
},
Search {
// iterate a small set, look up in the large set
self_iter: Iter<'a, T>,
other_set: &'a BTreeSet<T>,
},
Iterate(Iter<'a, T>), // simply stream self's elements
}

#[stable(feature = "collection_debug", since = "1.17.0")]
@@ -147,6 +150,7 @@ impl<T: fmt::Debug> fmt::Debug for Difference<'_, T> {
self_iter,
other_set: _,
} => f.debug_tuple("Difference").field(&self_iter).finish(),
DifferenceInner::Iterate(iter) => f.debug_tuple("Difference").field(&iter).finish(),
}
}
}
@@ -187,13 +191,16 @@ pub struct Intersection<'a, T: 'a> {
}
enum IntersectionInner<'a, T: 'a> {
Stitch {
// iterate similarly sized sets jointly, spotting matches along the way
a: Iter<'a, T>,
b: Iter<'a, T>,
},
Search {
// iterate a small set, look up in the large set
small_iter: Iter<'a, T>,
large_set: &'a BTreeSet<T>,
},
Answer(Option<&'a T>), // return a specific value or emptiness
}

#[stable(feature = "collection_debug", since = "1.17.0")]
@@ -212,6 +219,9 @@ impl<T: fmt::Debug> fmt::Debug for Intersection<'_, T> {
small_iter,
large_set: _,
} => f.debug_tuple("Intersection").field(&small_iter).finish(),
IntersectionInner::Answer(answer) => {
f.debug_tuple("Intersection").field(&answer).finish()
}
}
}
}
@@ -314,24 +324,51 @@ impl<T: Ord> BTreeSet<T> {
/// ```
#[stable(feature = "rust1", since = "1.0.0")]
pub fn difference<'a>(&'a self, other: &'a BTreeSet<T>) -> Difference<'a, T> {
if self.len() > other.len() / ITER_PERFORMANCE_TIPPING_SIZE_DIFF {
// Self is bigger than or not much smaller than other set.
// Iterate both sets jointly, spotting matches along the way.
Difference {
inner: DifferenceInner::Stitch {
self_iter: self.iter(),
other_iter: other.iter().peekable(),
},
}
let (self_min, self_max) = if let (Some(self_min), Some(self_max)) =
(self.iter().next(), self.iter().next_back())
{
(self_min, self_max)
} else {
// Self is much smaller than other set, or both sets are empty.
// Iterate the small set, searching for matches in the large set.
Difference {
inner: DifferenceInner::Search {
self_iter: self.iter(),
other_set: other,
},
}
return Difference {
inner: DifferenceInner::Iterate(self.iter()),
};
};
let (other_min, other_max) = if let (Some(other_min), Some(other_max)) =
(other.iter().next(), other.iter().next_back())
{
(other_min, other_max)
} else {
return Difference {
inner: DifferenceInner::Iterate(self.iter()),
};
};
Difference {
inner: match (self_min.cmp(other_max), self_max.cmp(other_min)) {
(Greater, _) | (_, Less) => DifferenceInner::Iterate(self.iter()),
(Equal, _) => {
let mut self_iter = self.iter();
self_iter.next();
DifferenceInner::Iterate(self_iter)
}
(_, Equal) => {
let mut self_iter = self.iter();
self_iter.next_back();
DifferenceInner::Iterate(self_iter)
}
_ => {
if self.len() <= other.len() / ITER_PERFORMANCE_TIPPING_SIZE_DIFF {
DifferenceInner::Search {
self_iter: self.iter(),
other_set: other,
}
} else {
DifferenceInner::Stitch {
self_iter: self.iter(),
other_iter: other.iter().peekable(),
}
}
}
},
}
}

@@ -387,29 +424,48 @@ impl<T: Ord> BTreeSet<T> {
/// ```
#[stable(feature = "rust1", since = "1.0.0")]
pub fn intersection<'a>(&'a self, other: &'a BTreeSet<T>) -> Intersection<'a, T> {
let (small, other) = if self.len() <= other.len() {
(self, other)
let (self_min, self_max) = if let (Some(self_min), Some(self_max)) =
(self.iter().next(), self.iter().next_back())
{
(self_min, self_max)
} else {
(other, self)
return Intersection {
inner: IntersectionInner::Answer(None),
};
};
if small.len() > other.len() / ITER_PERFORMANCE_TIPPING_SIZE_DIFF {
// Small set is not much smaller than other set.
// Iterate both sets jointly, spotting matches along the way.
Intersection {
inner: IntersectionInner::Stitch {
a: small.iter(),
b: other.iter(),
},
}
let (other_min, other_max) = if let (Some(other_min), Some(other_max)) =
(other.iter().next(), other.iter().next_back())
{
(other_min, other_max)
} else {
// Big difference in number of elements, or both sets are empty.
// Iterate the small set, searching for matches in the large set.
Intersection {
inner: IntersectionInner::Search {
small_iter: small.iter(),
large_set: other,
},
}
return Intersection {
inner: IntersectionInner::Answer(None),
};
};
Intersection {
inner: match (self_min.cmp(other_max), self_max.cmp(other_min)) {
(Greater, _) | (_, Less) => IntersectionInner::Answer(None),
(Equal, _) => IntersectionInner::Answer(Some(self_min)),
(_, Equal) => IntersectionInner::Answer(Some(self_max)),
_ => {
if self.len() <= other.len() / ITER_PERFORMANCE_TIPPING_SIZE_DIFF {
IntersectionInner::Search {
small_iter: self.iter(),
large_set: other,
}
} else if other.len() <= self.len() / ITER_PERFORMANCE_TIPPING_SIZE_DIFF {
IntersectionInner::Search {
small_iter: other.iter(),
large_set: self,
}
} else {
IntersectionInner::Stitch {
a: self.iter(),
b: other.iter(),
}
}
}
},
}
}

@@ -544,43 +600,61 @@ impl<T: Ord> BTreeSet<T> {
#[stable(feature = "rust1", since = "1.0.0")]
pub fn is_subset(&self, other: &BTreeSet<T>) -> bool {
// Same result as self.difference(other).next().is_none()
// but the 3 paths below are faster (in order: hugely, 20%, 5%).
// but the code below is faster (hugely in some cases).
if self.len() > other.len() {
false
} else if self.len() > other.len() / ITER_PERFORMANCE_TIPPING_SIZE_DIFF {
// Self is not much smaller than other set.
// Stolen from TreeMap
let mut x = self.iter();
let mut y = other.iter();
let mut a = x.next();
let mut b = y.next();
while a.is_some() {
if b.is_none() {
return false;
}
let (self_min, self_max) = if let (Some(self_min), Some(self_max)) =
(self.iter().next(), self.iter().next_back())
{
(self_min, self_max)
} else {
return true; // self is empty
};
let (other_min, other_max) = if let (Some(other_min), Some(other_max)) =
(other.iter().next(), other.iter().next_back())
{
(other_min, other_max)
} else {
return false; // other is empty
Copy link
Contributor Author

@ssomers ssomers Oct 1, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This else-part cannot be reached, due to the performance shortcut on top. It's possible to:

  • merge this let if with the if let above, but then it's not at all clear to the casual reader that it should return true
  • write a panic! explaining this, better than raw unwrap I guess, but pointless extra code

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

unreachable!("message") is the panic for that, but since we don't need a panic - false is correct, it seems this works just as well.

};
let mut self_iter = self.iter();
match self_min.cmp(other_min) {
Less => return false,
Equal => {
self_iter.next();
}
Greater => (),
}
match self_max.cmp(other_max) {
Greater => return false,
Equal => {
self_iter.next_back();
}
Less => (),
}
if self_iter.len() <= other.len() / ITER_PERFORMANCE_TIPPING_SIZE_DIFF {
// Big difference in number of elements.
for next in self_iter {
if !other.contains(next) {
return false;
}

let a1 = a.unwrap();
let b1 = b.unwrap();

match b1.cmp(a1) {
Less => (),
Greater => return false,
Equal => a = x.next(),
}

b = y.next();
}
true
} else {
// Big difference in number of elements, or both sets are empty.
// Iterate the small set, searching for matches in the large set.
for next in self {
if !other.contains(next) {
return false;
// Self is not much smaller than other set.
let mut other_iter = other.iter();
other_iter.next();
other_iter.next_back();
let mut self_next = self_iter.next();
while let Some(self1) = self_next {
match other_iter.next().map_or(Less, |other1| self1.cmp(other1)) {
Less => return false,
Equal => self_next = self_iter.next(),
Greater => (),
}
}
true
}
true
}

/// Returns `true` if the set is a superset of another,
@@ -1120,6 +1194,7 @@ impl<T> Clone for Difference<'_, T> {
self_iter: self_iter.clone(),
other_set,
},
DifferenceInner::Iterate(iter) => DifferenceInner::Iterate(iter.clone()),
},
}
}
@@ -1138,7 +1213,7 @@ impl<'a, T: Ord> Iterator for Difference<'a, T> {
loop {
match other_iter
.peek()
.map_or(Less, |other_next| Ord::cmp(self_next, other_next))
.map_or(Less, |other_next| self_next.cmp(other_next))
{
Less => return Some(self_next),
Equal => {
@@ -1160,19 +1235,21 @@ impl<'a, T: Ord> Iterator for Difference<'a, T> {
return Some(self_next);
}
},
DifferenceInner::Iterate(iter) => iter.next(),
}
}

fn size_hint(&self) -> (usize, Option<usize>) {
let (self_len, other_len) = match &self.inner {
DifferenceInner::Stitch {
self_iter,
other_iter
other_iter,
} => (self_iter.len(), other_iter.len()),
DifferenceInner::Search {
self_iter,
other_set
other_set,
} => (self_iter.len(), other_set.len()),
DifferenceInner::Iterate(iter) => (iter.len(), 0),
};
(self_len.saturating_sub(other_len), Some(self_len))
}
@@ -1234,6 +1311,7 @@ impl<T> Clone for Intersection<'_, T> {
small_iter: small_iter.clone(),
large_set,
},
IntersectionInner::Answer(answer) => IntersectionInner::Answer(answer.clone()),
},
}
}
@@ -1251,7 +1329,7 @@ impl<'a, T: Ord> Iterator for Intersection<'a, T> {
let mut a_next = a.next()?;
let mut b_next = b.next()?;
loop {
match Ord::cmp(a_next, b_next) {
match a_next.cmp(b_next) {
Less => a_next = a.next()?,
Greater => b_next = b.next()?,
Equal => return Some(a_next),
@@ -1267,15 +1345,17 @@ impl<'a, T: Ord> Iterator for Intersection<'a, T> {
return Some(small_next);
}
},
IntersectionInner::Answer(answer) => answer.take(),
}
}

fn size_hint(&self) -> (usize, Option<usize>) {
let min_len = match &self.inner {
IntersectionInner::Stitch { a, b } => min(a.len(), b.len()),
IntersectionInner::Search { small_iter, .. } => small_iter.len(),
};
(0, Some(min_len))
match &self.inner {
IntersectionInner::Stitch { a, b } => (0, Some(min(a.len(), b.len()))),
IntersectionInner::Search { small_iter, .. } => (0, Some(small_iter.len())),
IntersectionInner::Answer(None) => (0, Some(0)),
IntersectionInner::Answer(Some(_)) => (1, Some(1)),
}
}
}

110 changes: 92 additions & 18 deletions src/liballoc/tests/btree/set.rs
Original file line number Diff line number Diff line change
@@ -48,7 +48,9 @@ fn check<F>(a: &[i32], b: &[i32], expected: &[i32], f: F)
f(&set_a,
&set_b,
&mut |&x| {
assert_eq!(x, expected[i]);
if i < expected.len() {
assert_eq!(x, expected[i]);
}
i += 1;
true
});
@@ -74,31 +76,36 @@ fn test_intersection() {
return;
}

let large = (0..1000).collect::<Vec<_>>();
let large = (0..100).collect::<Vec<_>>();
check_intersection(&[], &large, &[]);
check_intersection(&large, &[], &[]);
check_intersection(&[-1], &large, &[]);
check_intersection(&large, &[-1], &[]);
check_intersection(&[0], &large, &[0]);
check_intersection(&large, &[0], &[0]);
check_intersection(&[999], &large, &[999]);
check_intersection(&large, &[999], &[999]);
check_intersection(&[1000], &large, &[]);
check_intersection(&large, &[1000], &[]);
check_intersection(&[11, 5000, 1, 3, 77, 8924, 103],
check_intersection(&[99], &large, &[99]);
check_intersection(&large, &[99], &[99]);
check_intersection(&[100], &large, &[]);
check_intersection(&large, &[100], &[]);
check_intersection(&[11, 5000, 1, 3, 77, 8924],
&large,
&[1, 3, 11, 77, 103]);
&[1, 3, 11, 77]);
}

#[test]
fn test_intersection_size_hint() {
let x: BTreeSet<i32> = [3, 4].iter().copied().collect();
let y: BTreeSet<i32> = [1, 2, 3].iter().copied().collect();
let mut iter = x.intersection(&y);
assert_eq!(iter.size_hint(), (0, Some(2)));
assert_eq!(iter.size_hint(), (1, Some(1)));
assert_eq!(iter.next(), Some(&3));
assert_eq!(iter.size_hint(), (0, Some(0)));
assert_eq!(iter.next(), None);

iter = y.intersection(&y);
assert_eq!(iter.size_hint(), (0, Some(3)));
assert_eq!(iter.next(), Some(&1));
assert_eq!(iter.size_hint(), (0, Some(2)));
}

#[test]
@@ -111,6 +118,9 @@ fn test_difference() {
check_difference(&[1, 12], &[], &[1, 12]);
check_difference(&[], &[1, 2, 3, 9], &[]);
check_difference(&[1, 3, 5, 9, 11], &[3, 9], &[1, 5, 11]);
check_difference(&[1, 3, 5, 9, 11], &[3, 6, 9], &[1, 5, 11]);
check_difference(&[1, 3, 5, 9, 11], &[0, 1], &[3, 5, 9, 11]);
check_difference(&[1, 3, 5, 9, 11], &[11, 12], &[1, 3, 5, 9]);
check_difference(&[-5, 11, 22, 33, 40, 42],
&[-12, -5, 14, 23, 34, 38, 39, 50],
&[11, 22, 33, 40, 42]);
@@ -119,18 +129,82 @@ fn test_difference() {
return;
}

let large = (0..1000).collect::<Vec<_>>();
let large = (0..100).collect::<Vec<_>>();
check_difference(&[], &large, &[]);
check_difference(&[-1], &large, &[-1]);
check_difference(&[0], &large, &[]);
check_difference(&[999], &large, &[]);
check_difference(&[1000], &large, &[1000]);
check_difference(&[11, 5000, 1, 3, 77, 8924, 103],
check_difference(&[99], &large, &[]);
check_difference(&[100], &large, &[100]);
check_difference(&[11, 5000, 1, 3, 77, 8924],
&large,
&[5000, 8924]);
check_difference(&large, &[], &large);
check_difference(&large, &[-1], &large);
check_difference(&large, &[1000], &large);
check_difference(&large, &[100], &large);
}

#[test]
fn test_difference_size_hint() {
let s246: BTreeSet<i32> = [2, 4, 6].iter().copied().collect();
let s23456: BTreeSet<i32> = (2..=6).collect();
let mut iter = s246.difference(&s23456);
assert_eq!(iter.size_hint(), (0, Some(3)));
assert_eq!(iter.next(), None);

let s12345: BTreeSet<i32> = (1..=5).collect();
iter = s246.difference(&s12345);
assert_eq!(iter.size_hint(), (0, Some(3)));
assert_eq!(iter.next(), Some(&6));
assert_eq!(iter.size_hint(), (0, Some(0)));
assert_eq!(iter.next(), None);

let s34567: BTreeSet<i32> = (3..=7).collect();
iter = s246.difference(&s34567);
assert_eq!(iter.size_hint(), (0, Some(3)));
assert_eq!(iter.next(), Some(&2));
assert_eq!(iter.size_hint(), (0, Some(2)));
assert_eq!(iter.next(), None);

let s1: BTreeSet<i32> = (-9..=1).collect();
iter = s246.difference(&s1);
assert_eq!(iter.size_hint(), (3, Some(3)));

let s2: BTreeSet<i32> = (-9..=2).collect();
iter = s246.difference(&s2);
assert_eq!(iter.size_hint(), (2, Some(2)));
assert_eq!(iter.next(), Some(&4));
assert_eq!(iter.size_hint(), (1, Some(1)));

let s23: BTreeSet<i32> = (2..=3).collect();
iter = s246.difference(&s23);
assert_eq!(iter.size_hint(), (1, Some(3)));
assert_eq!(iter.next(), Some(&4));
assert_eq!(iter.size_hint(), (1, Some(1)));

let s4: BTreeSet<i32> = (4..=4).collect();
iter = s246.difference(&s4);
assert_eq!(iter.size_hint(), (2, Some(3)));
assert_eq!(iter.next(), Some(&2));
assert_eq!(iter.size_hint(), (1, Some(2)));
assert_eq!(iter.next(), Some(&6));
assert_eq!(iter.size_hint(), (0, Some(0)));
assert_eq!(iter.next(), None);

let s56: BTreeSet<i32> = (5..=6).collect();
iter = s246.difference(&s56);
assert_eq!(iter.size_hint(), (1, Some(3)));
assert_eq!(iter.next(), Some(&2));
assert_eq!(iter.size_hint(), (0, Some(2)));

let s6: BTreeSet<i32> = (6..=19).collect();
iter = s246.difference(&s6);
assert_eq!(iter.size_hint(), (2, Some(2)));
assert_eq!(iter.next(), Some(&2));
assert_eq!(iter.size_hint(), (1, Some(1)));

let s7: BTreeSet<i32> = (7..=19).collect();
iter = s246.difference(&s7);
assert_eq!(iter.size_hint(), (3, Some(3)));
}

#[test]
@@ -188,23 +262,23 @@ fn test_is_subset() {
assert_eq!(is_subset(&[1, 2], &[1, 2]), true);
assert_eq!(is_subset(&[1, 2], &[2, 3]), false);
assert_eq!(is_subset(&[-5, 11, 22, 33, 40, 42],
&[-12, -5, 14, 23, 11, 34, 22, 38, 33, 42, 39, 40]),
&[-12, -5, 11, 14, 22, 23, 33, 34, 38, 39, 40, 42]),
true);
assert_eq!(is_subset(&[-5, 11, 22, 33, 40, 42],
&[-12, -5, 14, 23, 34, 38, 22, 11]),
&[-12, -5, 11, 14, 22, 23, 34, 38]),
false);

if cfg!(miri) { // Miri is too slow
return;
}

let large = (0..1000).collect::<Vec<_>>();
let large = (0..100).collect::<Vec<_>>();
assert_eq!(is_subset(&[], &large), true);
assert_eq!(is_subset(&large, &[]), false);
assert_eq!(is_subset(&[-1], &large), false);
assert_eq!(is_subset(&[0], &large), true);
assert_eq!(is_subset(&[1, 2], &large), true);
assert_eq!(is_subset(&[999, 1000], &large), false);
assert_eq!(is_subset(&[99, 100], &large), false);
}

#[test]