diff --git a/src/data_format/mod.rs b/src/data_format/mod.rs index b2e86263..0f8f5048 100644 --- a/src/data_format/mod.rs +++ b/src/data_format/mod.rs @@ -10,7 +10,7 @@ mod storage; pub(crate) mod utils; use crate::cosmetic_filter_cache::CosmeticFilterCache; -use crate::filters::unsafe_tools::VerifiedFlatbufferMemory; +use crate::flatbuffers::unsafe_tools::VerifiedFlatbufferMemory; use crate::network_filter_list::NetworkFilterListParsingError; /// Newer formats start with this magic byte sequence. diff --git a/src/data_format/storage.rs b/src/data_format/storage.rs index 3ffeab14..140d2bfc 100644 --- a/src/data_format/storage.rs +++ b/src/data_format/storage.rs @@ -10,7 +10,7 @@ use rmp_serde as rmps; use serde::{Deserialize, Serialize}; use crate::cosmetic_filter_cache::{CosmeticFilterCache, HostnameRuleDb, ProceduralOrActionFilter}; -use crate::filters::unsafe_tools::VerifiedFlatbufferMemory; +use crate::flatbuffers::unsafe_tools::VerifiedFlatbufferMemory; use crate::utils::Hash; use super::utils::{stabilize_hashmap_serialization, stabilize_hashset_serialization}; diff --git a/src/filters/fb_builder.rs b/src/filters/fb_builder.rs index 1a3134f2..d83833bc 100644 --- a/src/filters/fb_builder.rs +++ b/src/filters/fb_builder.rs @@ -9,7 +9,7 @@ use std::vec; use flatbuffers::WIPOffset; use crate::filters::network::{NetworkFilter, NetworkFilterMaskHelper}; -use crate::filters::unsafe_tools::VerifiedFlatbufferMemory; +use crate::flatbuffers::unsafe_tools::VerifiedFlatbufferMemory; use crate::network_filter_list::token_histogram; use crate::optimizer; use crate::utils::{to_short_hash, Hash, ShortHash}; diff --git a/src/filters/fb_network.rs b/src/filters/fb_network.rs index 54498b71..c6d7940c 100644 --- a/src/filters/fb_network.rs +++ b/src/filters/fb_network.rs @@ -4,7 +4,7 @@ use std::collections::HashMap; use crate::filters::fb_builder::FlatBufferBuilder; use crate::filters::network::{NetworkFilterMask, NetworkFilterMaskHelper, NetworkMatchable}; -use 
crate::filters::unsafe_tools::{fb_vector_to_slice, VerifiedFlatbufferMemory}; +use crate::flatbuffers::unsafe_tools::{fb_vector_to_slice, VerifiedFlatbufferMemory}; use crate::regex_manager::RegexManager; use crate::request::Request; diff --git a/src/filters/flat_filter_map.rs b/src/filters/flat_filter_map.rs deleted file mode 100644 index 40df1958..00000000 --- a/src/filters/flat_filter_map.rs +++ /dev/null @@ -1,73 +0,0 @@ -//! Holds the implementation of [FlatFilterMap]. - -use flatbuffers::{Follow, ForwardsUOffset, Vector}; -use std::cmp::PartialOrd; - -/// A map-like container that uses flatbuffer references. -/// Provides O(log n) lookup time using binary search on the sorted index. -pub(crate) struct FlatFilterMap<'a, I: PartialOrd + Copy, V> { - index: &'a [I], - values: Vector<'a, ForwardsUOffset>, -} - -/// Iterator over NetworkFilter objects from [FlatFilterMap] -pub(crate) struct FlatFilterMapIterator<'a, I: PartialOrd + Copy, V> { - current_index: usize, - key: I, - indexes: &'a [I], - values: Vector<'a, ForwardsUOffset>, -} - -impl<'a, I, V> Iterator for FlatFilterMapIterator<'a, I, V> -where - I: PartialOrd + Copy, - V: Follow<'a>, -{ - type Item = (usize, >::Inner); - - fn next(&mut self) -> Option { - if self.current_index < self.indexes.len() { - if self.indexes[self.current_index] != self.key { - return None; - } - let index = self.current_index; - let filter = self.values.get(self.current_index); - self.current_index += 1; - Some((index, filter)) - } else { - None - } - } -} - -impl<'a, I: PartialOrd + Copy, V> FlatFilterMap<'a, I, V> { - /// Construct [FlatFilterMap] from two vectors: - /// - index: sorted array of keys - /// - values: array of values, same length as index - pub fn new(index: &'a [I], values: Vector<'a, ForwardsUOffset>) -> Self { - // Sanity check the size are equal. Note: next() will handle |values| correctly. 
- debug_assert!(index.len() == values.len()); - - debug_assert!(index.is_sorted()); - - Self { index, values } - } - - /// Get an iterator over NetworkFilter objects with the given hash key. - pub fn get(&self, key: I) -> FlatFilterMapIterator<'a, I, V> { - let start = self.index.partition_point(|x| *x < key); - FlatFilterMapIterator { - current_index: start, - key, - indexes: self.index, - values: self.values, - } - } -} - -impl FlatFilterMap<'_, I, V> { - #[cfg(test)] - pub fn total_size(&self) -> usize { - self.index.len() - } -} diff --git a/src/filters/mod.rs b/src/filters/mod.rs index d18e1f43..c1702a2d 100644 --- a/src/filters/mod.rs +++ b/src/filters/mod.rs @@ -6,6 +6,4 @@ mod network_matchers; pub mod cosmetic; pub(crate) mod fb_builder; pub(crate) mod fb_network; -pub(crate) mod flat_filter_map; pub mod network; -pub(crate) mod unsafe_tools; diff --git a/src/flatbuffers/containers/flat_multimap.rs b/src/flatbuffers/containers/flat_multimap.rs new file mode 100644 index 00000000..a90dcc59 --- /dev/null +++ b/src/flatbuffers/containers/flat_multimap.rs @@ -0,0 +1,86 @@ +use std::marker::PhantomData; + +use crate::flatbuffers::containers::sorted_index::SortedIndex; +use flatbuffers::{Follow, Vector}; + +/// A multimap-like view over flatbuffer references: one key may map to several adjacent values. +/// Provides O(log n) lookup time using binary search on the sorted index. +/// `I` is the key type and `Keys` is the container of keys: `&[I]` for fast indexing (u32, u64) +/// or a `flatbuffers::Vector` when the Vector cannot be converted to a slice (str).
+pub(crate) struct FlatMultiMapView<'a, I: Ord, V, Keys> +where + Keys: SortedIndex, + V: Follow<'a>, +{ + keys: Keys, + values: Vector<'a, V>, + _phantom: PhantomData, +} + +impl<'a, I: Ord + Copy, V, Keys> FlatMultiMapView<'a, I, V, Keys> +where + Keys: SortedIndex + Clone, + V: Follow<'a>, +{ + pub fn new(keys: Keys, values: Vector<'a, V>) -> Self { + debug_assert!(keys.len() == values.len()); + + Self { + keys, + values, + _phantom: PhantomData, + } + } + + pub fn get(&self, key: I) -> Option> { + let index = self.keys.partition_point(|x| *x < key); + if index < self.keys.len() && self.keys.get(index) == key { + Some(FlatMultiMapViewIterator { + index, + key, + keys: self.keys.clone(), // Cloning the keys is 3-4% faster than borrowing them (&) in benchmarks + values: self.values, + }) + } else { + None + } + } + + #[cfg(test)] + pub fn total_size(&self) -> usize { + self.keys.len() + } +} + +pub(crate) struct FlatMultiMapViewIterator<'a, I: Ord + Copy, V, Keys> +where + Keys: SortedIndex, + V: Follow<'a>, +{ + index: usize, + key: I, + keys: Keys, + values: Vector<'a, V>, +} + +impl<'a, I, V, Keys> Iterator for FlatMultiMapViewIterator<'a, I, V, Keys> +where + I: Ord + Copy, + V: Follow<'a>, + Keys: SortedIndex, +{ + type Item = (usize, >::Inner); + + fn next(&mut self) -> Option { + if self.index < self.keys.len() && self.keys.get(self.index) == self.key { + self.index += 1; + Some((self.index - 1, self.values.get(self.index - 1))) + } else { + None + } + } +} + +#[cfg(test)] +#[path = "../../../tests/unit/flatbuffers/containers/flat_multimap.rs"] +mod unit_tests; diff --git a/src/flatbuffers/containers/flat_set.rs b/src/flatbuffers/containers/flat_set.rs new file mode 100644 index 00000000..48b1199c --- /dev/null +++ b/src/flatbuffers/containers/flat_set.rs @@ -0,0 +1,49 @@ +#![allow(dead_code)] + +use std::marker::PhantomData; + +use crate::flatbuffers::containers::sorted_index::SortedIndex; + +/// A set-like container that uses flatbuffer references.
+/// Provides O(log n) lookup time using binary search on the sorted data. +/// `I` is the key type and `Keys` is the container of keys: `&[I]` for fast indexing (u32, u64) +/// or a `flatbuffers::Vector` when the Vector cannot be converted to a slice (str). +pub(crate) struct FlatSetView +where + Keys: SortedIndex, +{ + keys: Keys, + _phantom: PhantomData, +} + +impl FlatSetView +where + I: Ord, + Keys: SortedIndex, +{ + pub fn new(keys: Keys) -> Self { + Self { + keys, + _phantom: PhantomData, + } + } + + pub fn contains(&self, key: I) -> bool { + let index = self.keys.partition_point(|x| *x < key); + index < self.keys.len() && self.keys.get(index) == key + } + + #[inline(always)] + pub fn len(&self) -> usize { + self.keys.len() + } + + #[inline(always)] + pub fn is_empty(&self) -> bool { + self.len() == 0 + } +} + +#[cfg(test)] +#[path = "../../../tests/unit/flatbuffers/containers/flat_set.rs"] +mod unit_tests; diff --git a/src/flatbuffers/containers/mod.rs b/src/flatbuffers/containers/mod.rs new file mode 100644 index 00000000..507620de --- /dev/null +++ b/src/flatbuffers/containers/mod.rs @@ -0,0 +1,3 @@ +pub(crate) mod flat_multimap; +pub(crate) mod flat_set; +pub(crate) mod sorted_index; diff --git a/src/flatbuffers/containers/sorted_index.rs b/src/flatbuffers/containers/sorted_index.rs new file mode 100644 index 00000000..166f491f --- /dev/null +++ b/src/flatbuffers/containers/sorted_index.rs @@ -0,0 +1,73 @@ +use flatbuffers::{Follow, Vector}; + +// Represents a sorted sequence of keys on which binary search can be performed. +pub(crate) trait SortedIndex { + fn len(&self) -> usize; + fn get(&self, index: usize) -> I; + fn partition_point(&self, predicate: F) -> usize + where + F: FnMut(&I) -> bool; +} + +// Implementation for slices. Prefer using this with fb_vector_to_slice +// if possible, because it is faster than getting values with flatbuffer's +// get method.
+impl SortedIndex for &[I] { + #[inline(always)] + fn len(&self) -> usize { + <[I]>::len(self) + } + + #[inline(always)] + fn get(&self, index: usize) -> I { + self[index] + } + + #[inline(always)] + fn partition_point(&self, predicate: F) -> usize + where + F: FnMut(&I) -> bool, + { + debug_assert!(self.is_sorted()); + <[I]>::partition_point(self, predicate) + } +} + +// General implementation for flatbuffers::Vector: it obtains values with Vector::get and +// runs a hand-written binary search in partition_point. +impl<'a, T: Follow<'a>> SortedIndex for Vector<'a, T> +where + T::Inner: Ord, +{ + #[inline(always)] + fn len(&self) -> usize { + Vector::len(self) + } + + #[inline(always)] + fn get(&self, index: usize) -> T::Inner { + Vector::get(self, index) + } + + fn partition_point(&self, mut predicate: F) -> usize + where + F: FnMut(&T::Inner) -> bool, + { + debug_assert!(self.iter().is_sorted()); + + let mut left = 0; + let mut right = self.len(); + + while left < right { + let mid = left + (right - left) / 2; + let value = self.get(mid); + if predicate(&value) { + left = mid + 1; + } else { + right = mid; + } + } + + left + } +} diff --git a/src/flatbuffers/mod.rs b/src/flatbuffers/mod.rs new file mode 100644 index 00000000..61dc0bd6 --- /dev/null +++ b/src/flatbuffers/mod.rs @@ -0,0 +1,2 @@ +pub(crate) mod containers; +pub(crate) mod unsafe_tools; diff --git a/src/filters/unsafe_tools.rs b/src/flatbuffers/unsafe_tools.rs similarity index 100% rename from src/filters/unsafe_tools.rs rename to src/flatbuffers/unsafe_tools.rs diff --git a/src/lib.rs b/src/lib.rs index a0bed513..d6327d2d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -23,6 +23,7 @@ pub mod cosmetic_filter_cache; mod data_format; mod engine; pub mod filters; +mod flatbuffers; pub mod lists; mod network_filter_list; mod optimizer; diff --git a/src/network_filter_list.rs b/src/network_filter_list.rs index ea27bb14..c0d2f6dc 100644 --- a/src/network_filter_list.rs +++ b/src/network_filter_list.rs @@ -2,13 +2,15 @@ use std::{collections::HashMap, collections::HashSet,
fmt}; +use flatbuffers::ForwardsUOffset; + use crate::filters::fb_network::flat::fb; use crate::filters::fb_network::{FilterDataContext, FlatNetworkFilter}; -use crate::filters::flat_filter_map::FlatFilterMap; use crate::filters::network::{ NetworkFilter, NetworkFilterMask, NetworkFilterMaskHelper, NetworkMatchable, }; -use crate::filters::unsafe_tools::fb_vector_to_slice; +use crate::flatbuffers::containers::flat_multimap::FlatMultiMapView; +use crate::flatbuffers::unsafe_tools::fb_vector_to_slice; use crate::regex_manager::RegexManager; use crate::request::Request; use crate::utils::{fast_hash, to_short_hash, Hash, ShortHash}; @@ -60,10 +62,13 @@ pub(crate) struct NetworkFilterList<'a> { pub(crate) filter_data_context: &'a FilterDataContext, } +type FlatNetworkFilterMap<'a> = + FlatMultiMapView<'a, ShortHash, ForwardsUOffset>, &'a [ShortHash]>; + impl NetworkFilterList<'_> { - pub fn get_filter_map(&self) -> FlatFilterMap { + pub fn get_filter_map(&self) -> FlatNetworkFilterMap { let filters_list = &self.list; - FlatFilterMap::new( + FlatNetworkFilterMap::new( fb_vector_to_slice(filters_list.filter_map_index()), filters_list.filter_map_values(), ) @@ -89,18 +94,21 @@ impl NetworkFilterList<'_> { let filter_map = self.get_filter_map(); for token in request.get_tokens_for_match() { - for (index, fb_filter) in filter_map.get(to_short_hash(*token)) { - let filter = FlatNetworkFilter::new(&fb_filter, index, self.filter_data_context); - - // if matched, also needs to be tagged with an active tag (or not tagged at all) - if filter.matches(request, regex_manager) - && filter.tag().is_none_or(|t| active_tags.contains(t)) - { - return Some(CheckResult { - filter_mask: filter.mask, - modifier_option: filter.modifier_option(), - raw_line: filter.raw_line(), - }); + if let Some(iter) = filter_map.get(to_short_hash(*token)) { + for (index, fb_filter) in iter { + let filter = + FlatNetworkFilter::new(&fb_filter, index, self.filter_data_context); + + // if matched, also needs to 
be tagged with an active tag (or not tagged at all) + if filter.matches(request, regex_manager) + && filter.tag().is_none_or(|t| active_tags.contains(t)) + { + return Some(CheckResult { + filter_mask: filter.mask, + modifier_option: filter.modifier_option(), + raw_line: filter.raw_line(), + }); + } } } } @@ -129,18 +137,21 @@ impl NetworkFilterList<'_> { let filter_map = self.get_filter_map(); for token in request.get_tokens_for_match() { - for (index, fb_filter) in filter_map.get(to_short_hash(*token)) { - let filter = FlatNetworkFilter::new(&fb_filter, index, self.filter_data_context); - - // if matched, also needs to be tagged with an active tag (or not tagged at all) - if filter.matches(request, regex_manager) - && filter.tag().is_none_or(|t| active_tags.contains(t)) - { - filters.push(CheckResult { - filter_mask: filter.mask, - modifier_option: filter.modifier_option(), - raw_line: filter.raw_line(), - }); + if let Some(iter) = filter_map.get(to_short_hash(*token)) { + for (index, fb_filter) in iter { + let filter = + FlatNetworkFilter::new(&fb_filter, index, self.filter_data_context); + + // if matched, also needs to be tagged with an active tag (or not tagged at all) + if filter.matches(request, regex_manager) + && filter.tag().is_none_or(|t| active_tags.contains(t)) + { + filters.push(CheckResult { + filter_mask: filter.mask, + modifier_option: filter.modifier_option(), + raw_line: filter.raw_line(), + }); + } } } } diff --git a/tests/live.rs b/tests/live.rs index cdb1999c..8cfa7a8c 100644 --- a/tests/live.rs +++ b/tests/live.rs @@ -230,6 +230,7 @@ fn check_live_from_filterlists() { #[cfg(feature = "resource-assembler")] #[test] +#[ignore = "issues/499"] fn check_live_redirects() { use adblock::resources::resource_assembler::assemble_web_accessible_resources; diff --git a/tests/unit/flatbuffers/containers/flat_multimap.rs b/tests/unit/flatbuffers/containers/flat_multimap.rs new file mode 100644 index 00000000..74505bd6 --- /dev/null +++ 
b/tests/unit/flatbuffers/containers/flat_multimap.rs @@ -0,0 +1,102 @@ +#[cfg(test)] +mod tests { + use super::super::*; + + // Helper: writes `data` into `builder` as a vector and returns a Vector view of the finished buffer + fn create_vector_u32<'a>( + builder: &'a mut flatbuffers::FlatBufferBuilder, + data: &'a [u32], + ) -> flatbuffers::Vector<'a, u32> { + let vec_offset = builder.create_vector(data); + builder.finish(vec_offset, None); + let buf = builder.finished_data(); + flatbuffers::root::>(buf).expect("OK") + } + + #[test] + fn test_empty_map() { + let index: &[u32] = &[]; + let mut builder = flatbuffers::FlatBufferBuilder::new(); + let values = create_vector_u32(&mut builder, &[]); + let map = FlatMultiMapView::new(index, values); + + assert_eq!(map.total_size(), 0); + assert!(map.get(1).is_none()); + } + + #[test] + fn test_single_element() { + let index: &[u32] = &[1]; + let mut builder = flatbuffers::FlatBufferBuilder::new(); + let values = create_vector_u32(&mut builder, &[100]); + let map = FlatMultiMapView::new(index, values); + + assert_eq!(map.total_size(), 1); + + // Test existing key + let mut iter = map.get(1).unwrap(); + assert_eq!(iter.next(), Some((0, 100))); + assert_eq!(iter.next(), None); + + // Test non-existing key + assert!(map.get(2).is_none()); + } + + #[test] + fn test_multiple_elements() { + let index: &[u32] = &[1, 1, 2, 2, 2, 3]; + let mut builder = flatbuffers::FlatBufferBuilder::new(); + let values = create_vector_u32(&mut builder, &[10, 20, 30, 40, 50, 60]); + + let map = FlatMultiMapView::new(index, values); + + assert_eq!(map.total_size(), 6); + + // Test key with single value + let mut iter = map.get(3).unwrap(); + assert_eq!(iter.next(), Some((5, 60))); + assert_eq!(iter.next(), None); + + // Test key with multiple values + let mut iter = map.get(2).unwrap(); + assert_eq!(iter.next(), Some((2, 30))); + assert_eq!(iter.next(), Some((3, 40))); + assert_eq!(iter.next(), Some((4, 50))); + assert_eq!(iter.next(), None); + + // Test non-existing key + 
assert!(map.get(4).is_none()); + } + + #[test] + fn test_all_same_keys() { + let index: &[u32] = &[5, 5, 5]; + let mut builder = flatbuffers::FlatBufferBuilder::new(); + let values = create_vector_u32(&mut builder, &[100, 200, 300]); + let map = FlatMultiMapView::new(index, values); + + assert_eq!(map.total_size(), 3); + + let mut iter = map.get(5).unwrap(); + assert_eq!(iter.next(), Some((0, 100))); + assert_eq!(iter.next(), Some((1, 200))); + assert_eq!(iter.next(), Some((2, 300))); + assert_eq!(iter.next(), None); + } + + #[test] + fn test_non_contiguous_keys() { + let index: &[u32] = &[1, 3, 5]; + let mut builder = flatbuffers::FlatBufferBuilder::new(); + let values = create_vector_u32(&mut builder, &[10, 30, 50]); + let map = FlatMultiMapView::new(index, values); + + assert_eq!(map.total_size(), 3); + + assert_eq!(map.get(1).unwrap().next(), Some((0, 10))); + assert_eq!(map.get(3).unwrap().next(), Some((1, 30))); + assert_eq!(map.get(5).unwrap().next(), Some((2, 50))); + assert!(map.get(2).is_none()); + assert!(map.get(4).is_none()); + } +} diff --git a/tests/unit/flatbuffers/containers/flat_set.rs b/tests/unit/flatbuffers/containers/flat_set.rs new file mode 100644 index 00000000..2fb37813 --- /dev/null +++ b/tests/unit/flatbuffers/containers/flat_set.rs @@ -0,0 +1,19 @@ +#[cfg(test)] +mod tests { + use super::super::*; + + #[test] + fn test_flat_set_view() { + let data = vec![1, 2, 2, 3, 4, 4, 4, 5]; + let set = FlatSetView::::new(&data); + + // Test contains + assert!(set.contains(1)); + assert!(set.contains(2)); + assert!(set.contains(4)); + assert!(!set.contains(6)); + + // Test len: duplicate keys are counted, the view does not deduplicate + assert_eq!(set.len(), 8); + } +}