From 1bcdb158d7815d4ce169acd9493e2fbbc988413a Mon Sep 17 00:00:00 2001 From: Mikhail Atuchin Date: Fri, 20 Jun 2025 19:20:05 +0400 Subject: [PATCH 1/7] Use the one flatbuffer to store all lists --- src/blocker.rs | 235 +++++++----- src/data_format/storage.rs | 143 +------- src/filters/fb_network.rs | 170 +-------- src/filters/flat_builder.rs | 264 ++++++++++++++ src/filters/mod.rs | 1 + src/filters/unsafe_tools.rs | 14 +- src/flatbuffers/fb_network_filter.fbs | 6 +- .../fb_network_filter_generated.rs | 344 +++++++++++++----- src/network_filter_list.rs | 163 +-------- tests/unit/blocker.rs | 12 +- 10 files changed, 703 insertions(+), 649 deletions(-) create mode 100644 src/filters/flat_builder.rs diff --git a/src/blocker.rs b/src/blocker.rs index 11b7e3a0..edb77aa3 100644 --- a/src/blocker.rs +++ b/src/blocker.rs @@ -3,10 +3,13 @@ use memchr::{memchr as find_char, memrchr as find_char_reverse}; use once_cell::sync::Lazy; use serde::Serialize; -use std::collections::HashSet; +use std::collections::{HashMap, HashSet}; use std::ops::DerefMut; +use crate::filters::fb_network::NetworkFilterSharedState; +use crate::filters::flat_builder::FlatBufferBuilder; use crate::filters::network::{NetworkFilter, NetworkFilterMaskHelper}; +use crate::filters::unsafe_tools::VerifiedFlatbufferMemory; use crate::network_filter_list::NetworkFilterList; use crate::regex_manager::{RegexManager, RegexManagerDiscardPolicy}; use crate::request::Request; @@ -64,28 +67,32 @@ pub struct BlockerResult { // pass empty set for the rest static NO_TAGS: Lazy> = Lazy::new(HashSet::new); +// TODO: move to a proper place +pub(crate) enum FilterId { + Csp = 0, + Exceptions = 1, + Importants = 2, + Redirects = 3, + RemoveParam = 4, + Filters = 5, + GenericHide = 6, + TaggedFiltersAll = 7, + Size = 8, +} /// Stores network filters for efficient querying. pub struct Blocker { - pub(crate) csp: NetworkFilterList, - pub(crate) exceptions: NetworkFilterList, - pub(crate) importants: NetworkFilterList, - pub(crate) redirects: NetworkFilterList, - pub(crate) removeparam: NetworkFilterList, - pub(crate) filters: NetworkFilterList, - pub(crate) generic_hide: NetworkFilterList, + pub(crate) memory: VerifiedFlatbufferMemory, // Enabled tags are not serialized - when deserializing, tags of the existing // instance (the one we are recreating lists into) are maintained pub(crate) tags_enabled: HashSet, - pub(crate) tagged_filters_all: NetworkFilterList, - - pub(crate) enable_optimizations: bool, - // Not serialized #[cfg(feature = "unsync-regex-caching")] pub(crate) regex_manager: std::cell::RefCell, #[cfg(not(feature = "unsync-regex-caching"))] pub(crate) regex_manager: std::sync::Mutex, + + pub(crate) shared_state: NetworkFilterSharedState, } impl Blocker { @@ -95,6 +102,46 @@ impl Blocker { self.check_parameterised(request, resources, false, false) } + pub(crate) fn get_list(&self, id: FilterId) -> NetworkFilterList { + // TODO: verify lists() size and id is in range + NetworkFilterList { + list: self.memory.root().lists().get(id as usize), + shared_state: &self.shared_state, + } + } + + pub(crate) fn csp(&self) -> NetworkFilterList { + self.get_list(FilterId::Csp) + } + + pub(crate) fn exceptions(&self) -> NetworkFilterList { + self.get_list(FilterId::Exceptions) + } + + pub(crate) fn importants(&self) -> NetworkFilterList { + self.get_list(FilterId::Importants) + } + + pub(crate) fn redirects(&self) -> NetworkFilterList { + self.get_list(FilterId::Redirects) + } + + pub(crate) fn removeparam(&self) -> NetworkFilterList { + self.get_list(FilterId::RemoveParam) + } + + pub(crate) fn filters(&self) -> NetworkFilterList { + self.get_list(FilterId::Filters) + } + + pub(crate) fn generic_hide(&self) -> NetworkFilterList { + self.get_list(FilterId::GenericHide) + } + + pub(crate) fn tagged_filters_all(&self) -> NetworkFilterList { + self.get_list(FilterId::TaggedFiltersAll) + } + #[cfg(feature = "unsync-regex-caching")] fn borrow_regex_manager(&self) -> std::cell::RefMut { #[allow(unused_mut)] @@ -115,7 +162,7 @@ impl Blocker { pub fn check_generic_hide(&self, hostname_request: &Request) -> bool { let mut regex_manager = self.borrow_regex_manager(); - self.generic_hide + self.generic_hide() .check(hostname_request, &HashSet::new(), &mut regex_manager) .is_some() } @@ -139,13 +186,15 @@ impl Blocker { // 4. exceptions - if any non-important match of forced // Always check important filters - let important_filter = self.importants.check(request, &NO_TAGS, &mut regex_manager); + let important_filter = self + .importants() + .check(request, &NO_TAGS, &mut regex_manager); // only check the rest of the rules if not previously matched let filter = if important_filter.is_none() && !matched_rule { - self.tagged_filters_all + self.tagged_filters_all() .check(request, &self.tags_enabled, &mut regex_manager) - .or_else(|| self.filters.check(request, &NO_TAGS, &mut regex_manager)) + .or_else(|| self.filters().check(request, &NO_TAGS, &mut regex_manager)) } else { important_filter }; @@ -153,19 +202,19 @@ impl Blocker { let exception = match filter.as_ref() { // if no other rule matches, only check exceptions if forced to None if matched_rule || force_check_exceptions => { - self.exceptions + self.exceptions() .check(request, &self.tags_enabled, &mut regex_manager) } None => None, // If matched an important filter, exceptions don't atter Some(f) if f.is_important() => None, Some(_) => self - .exceptions + .exceptions() .check(request, &self.tags_enabled, &mut regex_manager), }; let redirect_filters = - self.redirects + self.redirects() .check_all(request, &NO_TAGS, regex_manager.deref_mut()); // Extract the highest priority redirect directive. @@ -231,7 +280,7 @@ impl Blocker { let rewritten_url = if important { None } else { - Self::apply_removeparam(&self.removeparam, request, regex_manager.deref_mut()) + Self::apply_removeparam(&self.removeparam(), request, regex_manager.deref_mut()) }; // If something has already matched before but we don't know what, still return a match @@ -346,7 +395,7 @@ impl Blocker { let mut regex_manager = self.borrow_regex_manager(); let filters = self - .csp + .csp() .check_all(request, &self.tags_enabled, &mut regex_manager); if filters.is_empty() { @@ -390,94 +439,82 @@ impl Blocker { Some(merged) } - pub fn new(network_filters: Vec, options: &BlockerOptions) -> Self { - // Capacity of filter subsets estimated based on counts in EasyList and EasyPrivacy - if necessary - // the Vectors will grow beyond the pre-set capacity, but it is more efficient to allocate all at once - // $csp= - let mut csp = Vec::with_capacity(200); - // @@filter - let mut exceptions = Vec::with_capacity(network_filters.len() / 8); - // $important - let mut importants = Vec::with_capacity(200); - // $redirect, $redirect-rule - let mut redirects = Vec::with_capacity(200); - // $removeparam - let mut removeparam = Vec::with_capacity(60); - // $tag= - let mut tagged_filters_all = Vec::with_capacity(200); - // $badfilter - let mut badfilters = Vec::with_capacity(100); - // $generichide - let mut generic_hide = Vec::with_capacity(4000); - // All other filters - let mut filters = Vec::with_capacity(network_filters.len()); + pub(crate) fn from_verified_memory(memory: VerifiedFlatbufferMemory) -> Self { + // Reconstruct the unique_domains_hashes_map from the flatbuffer data + let root = memory.root(); + let mut unique_domains_hashes_map: HashMap = HashMap::new(); + for (index, hash) in root.unique_domains_hashes().iter().enumerate() { + unique_domains_hashes_map.insert(hash, index as u32); + } + + let shared_state = NetworkFilterSharedState { + unique_domains_hashes_map, + }; + Self { + tags_enabled: HashSet::new(), + regex_manager: Default::default(), + memory, + shared_state, + } + } + + pub fn new(mut network_filters: Vec, options: &BlockerOptions) -> Self { // Injections // TODO: resource handling - if !network_filters.is_empty() { - for filter in network_filters.iter() { - if filter.is_badfilter() { - badfilters.push(filter); - } - } - let badfilter_ids: HashSet = badfilters - .iter() - .map(|f| f.get_id_without_badfilter()) - .collect(); - for filter in network_filters { - // skip any bad filters - let filter_id = filter.get_id(); - if badfilter_ids.contains(&filter_id) || filter.is_badfilter() { - continue; - } + let mut builder = FlatBufferBuilder::new(FilterId::Size as usize); - // Redirects are independent of blocking behavior. - if filter.is_redirect() { - redirects.push(filter.clone()); - } + let mut badfilter_ids: HashSet = HashSet::new(); + for filter in network_filters.iter() { + if filter.is_badfilter() { + badfilter_ids.insert(filter.get_id_without_badfilter()); + } + } + for filter in network_filters.drain(..) { + // skip any bad filters + let filter_id = filter.get_id(); + if badfilter_ids.contains(&filter_id) || filter.is_badfilter() { + continue; + } - if filter.is_csp() { - csp.push(filter); - } else if filter.is_removeparam() { - removeparam.push(filter); - } else if filter.is_generic_hide() { - generic_hide.push(filter); - } else if filter.is_exception() { - exceptions.push(filter); - } else if filter.is_important() { - importants.push(filter); - } else if filter.tag.is_some() && !filter.is_redirect() { - // `tag` + `redirect` is unsupported for now. - tagged_filters_all.push(filter); - } else if (filter.is_redirect() && filter.also_block_redirect()) - || !filter.is_redirect() - { - filters.push(filter); - } + // Redirects are independent of blocking behavior. + if filter.is_redirect() { + builder.add_filter(filter.clone(), FilterId::Redirects as u32); } + + let list_id: FilterId = if filter.is_csp() { + FilterId::Csp + } else if filter.is_removeparam() { + FilterId::RemoveParam + } else if filter.is_generic_hide() { + FilterId::GenericHide + } else if filter.is_exception() { + FilterId::Exceptions + } else if filter.is_important() { + FilterId::Importants + } else if filter.tag.is_some() && !filter.is_redirect() { + // `tag` + `redirect` is unsupported for now. + FilterId::TaggedFiltersAll + } else if (filter.is_redirect() && filter.also_block_redirect()) + || !filter.is_redirect() + { + FilterId::Filters + } else { + continue; + }; + + builder.add_filter(filter, list_id as u32); } - Self { - csp: NetworkFilterList::new(csp, options.enable_optimizations), - exceptions: NetworkFilterList::new(exceptions, options.enable_optimizations), - importants: NetworkFilterList::new(importants, options.enable_optimizations), - redirects: NetworkFilterList::new(redirects, options.enable_optimizations), + let memory = builder.finish(if options.enable_optimizations { // Don't optimize removeparam, since it can fuse filters without respecting distinct - // queryparam values - removeparam: NetworkFilterList::new(removeparam, false), - filters: NetworkFilterList::new(filters, options.enable_optimizations), - generic_hide: NetworkFilterList::new(generic_hide, options.enable_optimizations), - // Tags special case for enabling/disabling them dynamically - tags_enabled: HashSet::new(), - tagged_filters_all: NetworkFilterList::new( - tagged_filters_all, - options.enable_optimizations, - ), - // Options - enable_optimizations: options.enable_optimizations, - regex_manager: Default::default(), - } + |id: u32| id != FilterId::RemoveParam as u32 + } else { + |_| false + }); + + Self::from_verified_memory(memory) } pub fn use_tags(&mut self, tags: &[&str]) { diff --git a/src/data_format/storage.rs b/src/data_format/storage.rs index ce4132d4..1b60301f 100644 --- a/src/data_format/storage.rs +++ b/src/data_format/storage.rs @@ -11,8 +11,7 @@ use serde::{Deserialize, Serialize}; use crate::blocker::Blocker; use crate::cosmetic_filter_cache::{CosmeticFilterCache, HostnameRuleDb, ProceduralOrActionFilter}; -use crate::filters::network::{NetworkFilter, NetworkFilterMaskHelper}; -use crate::network_filter_list::NetworkFilterList; +use crate::filters::unsafe_tools::VerifiedFlatbufferMemory; use crate::utils::Hash; use super::utils::{stabilize_hashmap_serialization, stabilize_hashset_serialization}; @@ -182,93 +181,11 @@ pub(crate) struct LegacyScriptletResourceStorage { resources: HashMap, } -/// `_bug` is no longer used, and is removed from future format versions. -#[derive(Debug, Clone, Serialize)] -struct NetworkFilterSerializeFmt<'a> { - mask: &'a crate::filters::network::NetworkFilterMask, - filter: &'a crate::filters::network::FilterPart, - opt_domains: &'a Option>, - opt_not_domains: &'a Option>, - redirect: &'a Option, - hostname: &'a Option, - csp: &'a Option, - _bug: Option, - tag: &'a Option, - raw_line: Option, - id: &'a crate::utils::Hash, -} - -/// Generic over `Borrow` because `tagged_filters_all` requires `&'a NetworkFilter` -/// while `NetworkFilterList` requires `&'a Arc`. -impl<'a, T> From<&'a T> for NetworkFilterSerializeFmt<'a> -where - T: std::borrow::Borrow, -{ - fn from(v: &'a T) -> NetworkFilterSerializeFmt<'a> { - let v = v.borrow(); - NetworkFilterSerializeFmt { - mask: &v.mask, - filter: &v.filter, - opt_domains: &v.opt_domains, - opt_not_domains: &v.opt_not_domains, - redirect: if v.is_redirect() { - &v.modifier_option - } else { - &None - }, - hostname: &v.hostname, - csp: if v.is_csp() { - &v.modifier_option - } else { - &None - }, - _bug: None, - tag: &v.tag, - raw_line: v.raw_line.as_ref().map(|raw| *raw.clone()), - id: &v.id, - } - } -} - -/// Forces a `NetworkFilterList` to be serialized by converting to an -/// intermediate representation that is constructed with `NetworkFilterFmt` instead. -fn serialize_network_filter_list(list: &NetworkFilterList, s: S) -> Result -where - S: serde::Serializer, -{ - #[derive(Serialize, Default)] - struct NetworkFilterListSerializeFmt { - flatbuffer_memory: Vec, - } - - let storage_list = NetworkFilterListSerializeFmt { - flatbuffer_memory: list.memory.data().to_vec(), - }; - - storage_list.serialize(s) -} - /// Provides structural aggregration of referenced adblock engine data to allow for allocation-free /// serialization. #[derive(Serialize)] pub(crate) struct SerializeFormat<'a> { - #[serde(serialize_with = "serialize_network_filter_list")] - csp: &'a NetworkFilterList, - #[serde(serialize_with = "serialize_network_filter_list")] - exceptions: &'a NetworkFilterList, - #[serde(serialize_with = "serialize_network_filter_list")] - importants: &'a NetworkFilterList, - #[serde(serialize_with = "serialize_network_filter_list")] - redirects: &'a NetworkFilterList, - #[serde(serialize_with = "serialize_network_filter_list")] - filters: &'a NetworkFilterList, - #[serde(serialize_with = "serialize_network_filter_list")] - generic_hide: &'a NetworkFilterList, - - #[serde(serialize_with = "serialize_network_filter_list")] - tagged_filters_all: &'a NetworkFilterList, - - enable_optimizations: bool, + flatbuffer_memory: Vec, resources: LegacyRedirectResourceStorage, @@ -303,35 +220,11 @@ impl SerializeFormat<'_> { } } -#[derive(Debug, Deserialize, Default)] -pub(crate) struct NetworkFilterListDeserializeFmt { - pub flatbuffer_memory: Vec, -} - -impl TryFrom for NetworkFilterList { - fn try_from(v: NetworkFilterListDeserializeFmt) -> Result { - Ok(NetworkFilterList::try_from_unverified_memory( - v.flatbuffer_memory, - )?) - } - - type Error = DeserializationError; -} - /// Structural representation of adblock engine data that can be built up from deserialization and /// used directly to construct new `Engine` components without unnecessary allocation. #[derive(Deserialize)] pub(crate) struct DeserializeFormat { - csp: NetworkFilterListDeserializeFmt, - exceptions: NetworkFilterListDeserializeFmt, - importants: NetworkFilterListDeserializeFmt, - redirects: NetworkFilterListDeserializeFmt, - filters: NetworkFilterListDeserializeFmt, - generic_hide: NetworkFilterListDeserializeFmt, - - tagged_filters_all: NetworkFilterListDeserializeFmt, - - enable_optimizations: bool, + flatbuffer_memory: Vec, _resources: LegacyRedirectResourceStorage, @@ -364,16 +257,7 @@ impl<'a> From<(&'a Blocker, &'a CosmeticFilterCache)> for SerializeFormat<'a> { fn from(v: (&'a Blocker, &'a CosmeticFilterCache)) -> Self { let (blocker, cfc) = v; Self { - csp: &blocker.csp, - exceptions: &blocker.exceptions, - importants: &blocker.importants, - redirects: &blocker.redirects, - filters: &blocker.filters, - generic_hide: &blocker.generic_hide, - - tagged_filters_all: &blocker.tagged_filters_all, - - enable_optimizations: blocker.enable_optimizations, + flatbuffer_memory: blocker.memory.data().to_vec(), resources: LegacyRedirectResourceStorage::default(), @@ -403,22 +287,11 @@ impl TryFrom for (Blocker, CosmeticFilterCache) { specific_rules.procedural_action_exception = HostnameFilterBin(v.procedural_action_exception); + let memory = VerifiedFlatbufferMemory::from_raw(v.flatbuffer_memory) + .map_err(DeserializationError::FlatBufferParsingError)?; + Ok(( - Blocker { - csp: v.csp.try_into()?, - exceptions: v.exceptions.try_into()?, - importants: v.importants.try_into()?, - redirects: v.redirects.try_into()?, - removeparam: NetworkFilterList::default(), - filters: v.filters.try_into()?, - generic_hide: v.generic_hide.try_into()?, - - tags_enabled: Default::default(), - tagged_filters_all: v.tagged_filters_all.try_into()?, - - enable_optimizations: v.enable_optimizations, - regex_manager: Default::default(), - }, + Blocker::from_verified_memory(memory), CosmeticFilterCache { simple_class_rules: v.simple_class_rules, simple_id_rules: v.simple_id_rules, diff --git a/src/filters/fb_network.rs b/src/filters/fb_network.rs index 9f5bcb45..3e1fd64e 100644 --- a/src/filters/fb_network.rs +++ b/src/filters/fb_network.rs @@ -1,164 +1,19 @@ //! Flatbuffer-compatible versions of [NetworkFilter] and related functionality. use std::collections::HashMap; -use std::vec; -use flatbuffers::WIPOffset; +use crate::filters::network::{NetworkFilterMask, NetworkFilterMaskHelper, NetworkMatchable}; +use crate::filters::unsafe_tools::fb_vector_to_slice; -use crate::filters::network::{ - NetworkFilter, NetworkFilterMask, NetworkFilterMaskHelper, NetworkMatchable, -}; -use crate::filters::unsafe_tools::{fb_vector_to_slice, VerifiedFlatFilterListMemory}; - -use crate::network_filter_list::NetworkFilterList; use crate::regex_manager::RegexManager; use crate::request::Request; -use crate::utils::{Hash, ShortHash}; +use crate::utils::Hash; #[allow(dead_code, clippy::all, unused_imports, unsafe_code)] #[path = "../flatbuffers/fb_network_filter_generated.rs"] pub mod flat; use flat::fb; -/// Builder for [NetworkFilterList]. -pub(crate) struct FlatNetworkFiltersListBuilder<'a> { - builder: flatbuffers::FlatBufferBuilder<'a>, - filters: Vec>>, - - unique_domains_hashes: Vec, - unique_domains_hashes_map: HashMap, -} - -impl FlatNetworkFiltersListBuilder<'_> { - pub fn new() -> Self { - Self { - builder: flatbuffers::FlatBufferBuilder::new(), - filters: vec![], - unique_domains_hashes: vec![], - unique_domains_hashes_map: HashMap::new(), - } - } - - fn get_or_insert_unique_domain_hash(&mut self, h: &Hash) -> u32 { - if let Some(&index) = self.unique_domains_hashes_map.get(h) { - return index; - } - let index = self.unique_domains_hashes.len() as u32; - self.unique_domains_hashes.push(*h); - self.unique_domains_hashes_map.insert(*h, index); - index - } - - pub fn add(&mut self, network_filter: &NetworkFilter) -> u32 { - let opt_domains = network_filter.opt_domains.as_ref().map(|v| { - let mut o: Vec = v - .iter() - .map(|x| self.get_or_insert_unique_domain_hash(x)) - .collect(); - o.sort_unstable(); - o.dedup(); - self.builder.create_vector(&o) - }); - - let opt_not_domains = network_filter.opt_not_domains.as_ref().map(|v| { - let mut o: Vec = v - .iter() - .map(|x| self.get_or_insert_unique_domain_hash(x)) - .collect(); - o.sort_unstable(); - o.dedup(); - self.builder.create_vector(&o) - }); - - let modifier_option = network_filter - .modifier_option - .as_ref() - .map(|s| self.builder.create_string(s)); - - let hostname = network_filter - .hostname - .as_ref() - .map(|s| self.builder.create_string(s)); - - let tag = network_filter - .tag - .as_ref() - .map(|s| self.builder.create_string(s)); - - let patterns = if network_filter.filter.iter().len() > 0 { - let offsets: Vec> = network_filter - .filter - .iter() - .map(|s| self.builder.create_string(s)) - .collect(); - Some(self.builder.create_vector(&offsets)) - } else { - None - }; - - let raw_line = network_filter - .raw_line - .as_ref() - .map(|v| self.builder.create_string(v.as_str())); - - let filter = fb::NetworkFilter::create( - &mut self.builder, - &fb::NetworkFilterArgs { - mask: network_filter.mask.bits(), - patterns, - modifier_option, - opt_domains, - opt_not_domains, - hostname, - tag, - raw_line, - }, - ); - - self.filters.push(filter); - u32::try_from(self.filters.len() - 1).expect("< u32::MAX") - } - - pub fn finish( - &mut self, - mut filter_map: HashMap>, - ) -> VerifiedFlatFilterListMemory { - let unique_domains_hashes = self.builder.create_vector(&self.unique_domains_hashes); - - let len = filter_map.len(); - - // Convert filter_map keys to a sorted vector of (hash, filter_indices). - let mut entries: Vec<_> = filter_map.drain().collect(); - entries.sort_unstable_by_key(|(k, _)| *k); - - // Convert sorted_entries to two flatbuffers vectors. - let mut flat_index: Vec = Vec::with_capacity(len); - let mut flat_values: Vec<_> = Vec::with_capacity(len); - for (key, filter_indices) in entries { - for &filter_index in &filter_indices { - flat_index.push(key); - flat_values.push(self.filters[filter_index as usize]); - } - } - - let filter_map_index = self.builder.create_vector(&flat_index); - let filter_map_values = self.builder.create_vector(&flat_values); - - let storage = fb::NetworkFilterList::create( - &mut self.builder, - &fb::NetworkFilterListArgs { - filter_map_index: Some(filter_map_index), - filter_map_values: Some(filter_map_values), - unique_domains_hashes: Some(unique_domains_hashes), - }, - ); - self.builder.finish(storage, None); - - // TODO: consider using builder.collapse() to avoid reallocating memory. - VerifiedFlatFilterListMemory::from_builder(&self.builder) - } -} - /// A list of string parts that can be matched against a URL. pub(crate) struct FlatPatterns<'a> { patterns: Option>>, @@ -212,10 +67,15 @@ impl ExactSizeIterator for FlatPatternsIterator<'_> { } } +#[derive(Debug, Default)] +pub(crate) struct NetworkFilterSharedState { + pub(crate) unique_domains_hashes_map: HashMap, +} + /// Internal implementation of [NetworkFilter] that is compatible with flatbuffers. pub(crate) struct FlatNetworkFilter<'a> { key: u64, - owner: &'a NetworkFilterList, + shared_state: &'a NetworkFilterSharedState, fb_filter: &'a fb::NetworkFilter<'a>, pub(crate) mask: NetworkFilterMask, @@ -226,15 +86,13 @@ impl<'a> FlatNetworkFilter<'a> { pub fn new( filter: &'a fb::NetworkFilter<'a>, index: usize, - owner: &'a NetworkFilterList, + shared_state: &'a NetworkFilterSharedState, ) -> Self { - let list_address: *const NetworkFilterList = owner as *const NetworkFilterList; - Self { fb_filter: filter, - key: index as u64 | (((list_address) as u64) << 32), + key: index as u64, mask: NetworkFilterMask::from_bits_retain(filter.mask()), - owner, + shared_state, } } @@ -301,14 +159,14 @@ impl NetworkMatchable for FlatNetworkFilter<'_> { if !check_included_domains_mapped( self.include_domains(), request, - &self.owner.unique_domains_hashes_map, + &self.shared_state.unique_domains_hashes_map, ) { return false; } if !check_excluded_domains_mapped( self.exclude_domains(), request, - &self.owner.unique_domains_hashes_map, + &self.shared_state.unique_domains_hashes_map, ) { return false; } diff --git a/src/filters/flat_builder.rs b/src/filters/flat_builder.rs new file mode 100644 index 00000000..af63f384 --- /dev/null +++ b/src/filters/flat_builder.rs @@ -0,0 +1,264 @@ +//! Builder for creating flatbuffer-compatible Engine. + +use std::collections::HashMap; +use std::vec; + +use flatbuffers::WIPOffset; + +use crate::filters::network::NetworkFilter; +use crate::filters::unsafe_tools::VerifiedFlatbufferMemory; +use crate::network_filter_list::token_histogram; +use crate::optimizer; +use crate::utils::{to_short_hash, Hash, ShortHash}; + +use super::fb_network::flat::fb; + +#[derive(Default, Clone)] +struct FilterListBuilder { + filters: Vec, +} + +pub(crate) struct FlatBufferBuilder { + lists: Vec, + + unique_domains_hashes: Vec, + unique_domains_hashes_map: HashMap, + index: u32, +} + +impl FlatBufferBuilder { + pub fn new(list_count: usize) -> Self { + Self { + lists: vec![FilterListBuilder::default(); list_count], + unique_domains_hashes: vec![], + unique_domains_hashes_map: HashMap::new(), + index: 0, + } + } + + fn get_or_insert_unique_domain_hash(&mut self, h: &Hash) -> u32 { + if let Some(&index) = self.unique_domains_hashes_map.get(h) { + return index; + } + let index = self.unique_domains_hashes.len() as u32; + self.unique_domains_hashes.push(*h); + self.unique_domains_hashes_map.insert(*h, index); + index + } + + pub fn add_filter(&mut self, network_filter: NetworkFilter, list_id: u32) { + self.lists[list_id as usize].filters.push(network_filter); + } + + fn write_filter<'a>( + &mut self, + builder: &mut flatbuffers::FlatBufferBuilder<'a>, + network_filter: &NetworkFilter, + ) -> WIPOffset> { + let opt_domains = network_filter.opt_domains.as_ref().map(|v| { + let mut o: Vec = v + .iter() + .map(|x| self.get_or_insert_unique_domain_hash(x)) + .collect(); + o.sort_unstable(); + o.dedup(); + builder.create_vector(&o) + }); + + let opt_not_domains = network_filter.opt_not_domains.as_ref().map(|v| { + let mut o: Vec = v + .iter() + .map(|x| self.get_or_insert_unique_domain_hash(x)) + .collect(); + o.sort_unstable(); + o.dedup(); + builder.create_vector(&o) + }); + + let modifier_option = network_filter + .modifier_option + .as_ref() + .map(|s| builder.create_string(s)); + + let hostname = network_filter + .hostname + .as_ref() + .map(|s| builder.create_string(s)); + + let tag = network_filter + .tag + .as_ref() + .map(|s| builder.create_string(s)); + + let patterns = if network_filter.filter.iter().len() > 0 { + let offsets: Vec> = network_filter + .filter + .iter() + .map(|s| builder.create_string(s)) + .collect(); + Some(builder.create_vector(&offsets)) + } else { + None + }; + + let raw_line = network_filter + .raw_line + .as_ref() + .map(|v| builder.create_string(v.as_str())); + + let filter = fb::NetworkFilter::create( + builder, + &fb::NetworkFilterArgs { + mask: network_filter.mask.bits(), + patterns, + modifier_option, + opt_domains, + opt_not_domains, + hostname, + tag, + raw_line, + }, + ); + + self.index += 1; + + filter + } + + pub fn finish(&mut self, should_optimize: fn(u32) -> bool) -> VerifiedFlatbufferMemory { + let mut builder = flatbuffers::FlatBufferBuilder::new(); + let mut flat_lists = vec![]; + let mut lists = self.lists.drain(..).collect::>(); + for (list_id, list) in lists.drain(..).enumerate() { + let optimize = should_optimize(list_id as u32); + flat_lists.push(self.write_filter_list(&mut builder, list.filters, optimize)); + } + + // Create vectors first to avoid simultaneous mutable borrows of `builder`. + let lists_vec = builder.create_vector(&flat_lists); + let unique_vec = builder.create_vector(&self.unique_domains_hashes); + + let root = fb::Engine::create( + &mut builder, + &fb::EngineArgs { + lists: Some(lists_vec), + unique_domains_hashes: Some(unique_vec), + }, + ); + + builder.finish(root, None); + + // TODO: consider using builder.collapse() to avoid reallocating memory. + VerifiedFlatbufferMemory::from_builder(&builder) + } + + pub fn write_filter_list<'a>( + &mut self, + builder: &mut flatbuffers::FlatBufferBuilder<'a>, + filters: Vec, + optimize: bool, + ) -> WIPOffset> { + let mut filter_map = HashMap::>>>::new(); + + let mut optimizable = HashMap::>::new(); + + // Compute tokens for all filters + let filter_tokens: Vec<_> = filters + .into_iter() + .map(|filter| { + let tokens = filter.get_tokens(); + (filter, tokens) + }) + .collect(); + + // compute the tokens' frequency histogram + let (total_number_of_tokens, tokens_histogram) = token_histogram(&filter_tokens); + + { + for (network_filter, multi_tokens) in filter_tokens { + let flat_filter = if !optimize + || !optimizer::is_filter_optimizable_by_patterns(&network_filter) + { + Some(self.write_filter(builder, &network_filter)) + } else { + None + }; + + for tokens in multi_tokens { + let mut best_token: ShortHash = 0; + let mut min_count = total_number_of_tokens + 1; + for token in tokens { + let token = to_short_hash(token); + match tokens_histogram.get(&token) { + None => { + min_count = 0; + best_token = token + } + Some(&count) if count < min_count => { + min_count = count; + best_token = token + } + _ => {} + } + } + + if let Some(flat_filter) = flat_filter { + filter_map.entry(best_token).or_default().push(flat_filter); + } else { + optimizable + .entry(best_token) + .or_default() + .push(network_filter.clone()); + } + } // tokens + } + } + + if optimize { + // Sort the entries to ensure deterministic iteration order + let mut optimizable_entries: Vec<_> = optimizable.drain().collect(); + optimizable_entries.sort_unstable_by_key(|(token, _)| *token); + + for (token, v) in optimizable_entries { + let optimized = optimizer::optimize(v); + + for filter in optimized { + let flat_filter = self.write_filter(builder, &filter); + filter_map.entry(token).or_default().push(flat_filter); + } + } + } else { + debug_assert!( + optimizable.is_empty(), + "Should be empty if optimization is off" + ); + } + + let len = filter_map.len(); + + // Convert filter_map keys to a sorted vector of (hash, filter_indices). + let mut entries: Vec<_> = filter_map.drain().collect(); + entries.sort_unstable_by_key(|(k, _)| *k); + + // Convert sorted_entries to two flatbuffers vectors. + let mut flat_index: Vec = Vec::with_capacity(len); + let mut flat_values: Vec<_> = Vec::with_capacity(len); + for (key, filter_indices) in entries { + for &filter_index in &filter_indices { + flat_index.push(key); + flat_values.push(filter_index); + } + } + + let filter_map_index = builder.create_vector(&flat_index); + let filter_map_values = builder.create_vector(&flat_values); + + fb::NetworkFilterList::create( + builder, + &fb::NetworkFilterListArgs { + filter_map_index: Some(filter_map_index), + filter_map_values: Some(filter_map_values), + }, + ) + } +} diff --git a/src/filters/mod.rs b/src/filters/mod.rs index 1e25e7ad..21c0d0de 100644 --- a/src/filters/mod.rs +++ b/src/filters/mod.rs @@ -5,6 +5,7 @@ mod network_matchers; pub mod cosmetic; pub(crate) mod fb_network; +pub(crate) mod flat_builder; pub(crate) mod flat_filter_map; pub mod network; pub(crate) mod unsafe_tools; diff --git a/src/filters/unsafe_tools.rs b/src/filters/unsafe_tools.rs index 83ae9daf..f53edaa5 100644 --- a/src/filters/unsafe_tools.rs +++ b/src/filters/unsafe_tools.rs @@ -36,10 +36,10 @@ pub fn fb_vector_to_slice(vector: flatbuffers::Vector<'_, T>) -> &[T] { // It could be constructed from raw data (includes the flatbuffer verification) // or from a builder that have just been used to construct the flatbuffer // Invariants: -// 1. self.data() is properly verified flatbuffer contains FilterList. +// 1. self.data() is properly verified flatbuffer contains the root object. // 2. self.data() is aligned to MIN_ALIGNMENT bytes. // This is necessary for fb_vector_to_slice. -pub(crate) struct VerifiedFlatFilterListMemory { +pub(crate) struct VerifiedFlatbufferMemory { // The buffer containing the flatbuffer data. raw_data: Vec, @@ -48,17 +48,17 @@ pub(crate) struct VerifiedFlatFilterListMemory { start: usize, } -impl VerifiedFlatFilterListMemory { +impl VerifiedFlatbufferMemory { pub(crate) fn from_raw(data: Vec) -> Result { let memory = Self::from_vec(data); // Verify that the data is a valid flatbuffer. - let _ = fb::root_as_network_filter_list(memory.data())?; + let _ = fb::root_as_engine(memory.data())?; Ok(memory) } - // Creates a new VerifiedFlatFilterListMemory from a builder. + // Creates a new VerifiedFlatbufferMemory from a builder. // Skip the verification, the builder must contains a valid FilterList. pub(crate) fn from_builder(builder: &flatbuffers::FlatBufferBuilder<'_>) -> Self { let raw_data = builder.finished_data().to_vec(); @@ -86,8 +86,8 @@ impl VerifiedFlatFilterListMemory { memory } - pub(crate) fn filter_list(&self) -> fb::NetworkFilterList<'_> { - unsafe { fb::root_as_network_filter_list_unchecked(self.data()) } + pub(crate) fn root(&self) -> fb::Engine<'_> { + unsafe { fb::root_as_engine_unchecked(self.data()) } } pub fn data(&self) -> &[u8] { diff --git a/src/flatbuffers/fb_network_filter.fbs b/src/flatbuffers/fb_network_filter.fbs index cc5d0eb8..19649a13 100644 --- a/src/flatbuffers/fb_network_filter.fbs +++ b/src/flatbuffers/fb_network_filter.fbs @@ -27,7 +27,11 @@ table NetworkFilter { table NetworkFilterList { filter_map_index: [uint32] (required); filter_map_values: [NetworkFilter] (required); +} + +table Engine { + lists: [NetworkFilterList] (required); unique_domains_hashes: [uint64] (required); } -root_type NetworkFilterList; +root_type Engine; diff --git a/src/flatbuffers/fb_network_filter_generated.rs b/src/flatbuffers/fb_network_filter_generated.rs index 5dd1757d..b290c958 100644 --- a/src/flatbuffers/fb_network_filter_generated.rs +++ b/src/flatbuffers/fb_network_filter_generated.rs @@ -446,7 +446,6 @@ pub mod fb { impl<'a> NetworkFilterList<'a> { pub const VT_FILTER_MAP_INDEX: flatbuffers::VOffsetT = 4; pub const VT_FILTER_MAP_VALUES: flatbuffers::VOffsetT = 6; - pub const VT_UNIQUE_DOMAINS_HASHES: flatbuffers::VOffsetT = 8; #[inline] pub unsafe fn init_from_table(table: flatbuffers::Table<'a>) -> Self { @@ -463,9 +462,6 @@ pub mod fb { args: &'args NetworkFilterListArgs<'args>, ) -> flatbuffers::WIPOffset> { let mut builder = NetworkFilterListBuilder::new(_fbb); - if let Some(x) = args.unique_domains_hashes { - builder.add_unique_domains_hashes(x); - } if let Some(x) = args.filter_map_values { builder.add_filter_map_values(x); } @@ -484,14 +480,9 @@ pub mod fb { let x = self.filter_map_values(); x.iter().map(|t| t.unpack()).collect() }; - let unique_domains_hashes = { - let x = self.unique_domains_hashes(); - x.into_iter().collect() - }; NetworkFilterListT { filter_map_index, filter_map_values, - unique_domains_hashes, } } @@ -524,20 +515,6 @@ pub mod fb { .unwrap() } } - #[inline] - pub fn unique_domains_hashes(&self) -> flatbuffers::Vector<'a, u64> { - // Safety: - // Created from valid Table for this object - // which contains a valid value in this slot - unsafe { - self._tab - .get::>>( - NetworkFilterList::VT_UNIQUE_DOMAINS_HASHES, - None, - ) - .unwrap() - } - } } impl flatbuffers::Verifiable for NetworkFilterList<'_> { @@ -556,11 +533,6 @@ pub mod fb { .visit_field::>, >>("filter_map_values", Self::VT_FILTER_MAP_VALUES, true)? - .visit_field::>>( - "unique_domains_hashes", - Self::VT_UNIQUE_DOMAINS_HASHES, - true, - )? .finish(); Ok(()) } @@ -572,15 +544,13 @@ pub mod fb { flatbuffers::Vector<'a, flatbuffers::ForwardsUOffset>>, >, >, - pub unique_domains_hashes: Option>>, } impl<'a> Default for NetworkFilterListArgs<'a> { #[inline] fn default() -> Self { NetworkFilterListArgs { - filter_map_index: None, // required field - filter_map_values: None, // required field - unique_domains_hashes: None, // required field + filter_map_index: None, // required field + filter_map_values: None, // required field } } } @@ -613,16 +583,6 @@ pub mod fb { ); } #[inline] - pub fn add_unique_domains_hashes( - &mut self, - unique_domains_hashes: flatbuffers::WIPOffset>, - ) { - self.fbb_.push_slot_always::>( - NetworkFilterList::VT_UNIQUE_DOMAINS_HASHES, - unique_domains_hashes, - ); - } - #[inline] pub fn new( _fbb: &'b mut flatbuffers::FlatBufferBuilder<'a, A>, ) -> NetworkFilterListBuilder<'a, 'b, A> { @@ -645,11 +605,6 @@ pub mod fb { NetworkFilterList::VT_FILTER_MAP_VALUES, "filter_map_values", ); - self.fbb_.required( - o, - NetworkFilterList::VT_UNIQUE_DOMAINS_HASHES, - "unique_domains_hashes", - ); flatbuffers::WIPOffset::new(o.value()) } } @@ -659,7 +614,6 @@ pub mod fb { let mut ds = f.debug_struct("NetworkFilterList"); ds.field("filter_map_index", &self.filter_map_index()); ds.field("filter_map_values", &self.filter_map_values()); - ds.field("unique_domains_hashes", &self.unique_domains_hashes()); ds.finish() } } @@ -668,14 +622,12 @@ pub mod fb { pub struct NetworkFilterListT { pub filter_map_index: Vec, pub filter_map_values: Vec, - pub unique_domains_hashes: Vec, } impl Default for NetworkFilterListT { fn default() -> Self { Self { filter_map_index: Default::default(), filter_map_values: Default::default(), - unique_domains_hashes: Default::default(), } } } @@ -693,102 +645,308 @@ pub mod fb { let w: Vec<_> = x.iter().map(|t| t.pack(_fbb)).collect(); _fbb.create_vector(&w) }); - let unique_domains_hashes = Some({ - let x = &self.unique_domains_hashes; - _fbb.create_vector(x) - }); NetworkFilterList::create( _fbb, &NetworkFilterListArgs { filter_map_index, filter_map_values, + }, + ) + } + } + pub enum EngineOffset {} + #[derive(Copy, Clone, PartialEq)] + + pub struct Engine<'a> { + pub _tab: flatbuffers::Table<'a>, + } + + impl<'a> flatbuffers::Follow<'a> for Engine<'a> { + type Inner = Engine<'a>; + #[inline] + unsafe fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { + Self { + _tab: flatbuffers::Table::new(buf, loc), + } + } + } + + impl<'a> Engine<'a> { + pub const VT_LISTS: flatbuffers::VOffsetT = 4; + pub const VT_UNIQUE_DOMAINS_HASHES: flatbuffers::VOffsetT = 6; + + #[inline] + pub unsafe fn init_from_table(table: flatbuffers::Table<'a>) -> Self { + Engine { _tab: table } + } + #[allow(unused_mut)] + pub fn create< + 'bldr: 'args, + 'args: 'mut_bldr, + 'mut_bldr, + A: flatbuffers::Allocator + 'bldr, + >( + _fbb: &'mut_bldr mut flatbuffers::FlatBufferBuilder<'bldr, A>, + args: &'args EngineArgs<'args>, + ) -> flatbuffers::WIPOffset> { + let mut builder = EngineBuilder::new(_fbb); + if let Some(x) = args.unique_domains_hashes { + builder.add_unique_domains_hashes(x); + } + if let Some(x) = args.lists { + builder.add_lists(x); + } + builder.finish() + } + + pub fn unpack(&self) -> EngineT { + let lists = { + let x = self.lists(); + x.iter().map(|t| t.unpack()).collect() + }; + let unique_domains_hashes = { + let x = self.unique_domains_hashes(); + x.into_iter().collect() + }; + EngineT { + lists, + unique_domains_hashes, + } + } + + #[inline] + pub fn lists( + &self, + ) -> flatbuffers::Vector<'a, flatbuffers::ForwardsUOffset>> { + // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { + self._tab + .get::>, + >>(Engine::VT_LISTS, None) + .unwrap() + } + } + #[inline] + pub fn unique_domains_hashes(&self) -> flatbuffers::Vector<'a, u64> { + // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { + self._tab + .get::>>( + Engine::VT_UNIQUE_DOMAINS_HASHES, + None, + ) + .unwrap() + } + } + } + + impl flatbuffers::Verifiable for Engine<'_> { + #[inline] + fn run_verifier( + v: &mut flatbuffers::Verifier, + pos: usize, + ) -> Result<(), flatbuffers::InvalidFlatbuffer> { + use self::flatbuffers::Verifiable; + v.visit_table(pos)? + .visit_field::>, + >>("lists", Self::VT_LISTS, true)? + .visit_field::>>( + "unique_domains_hashes", + Self::VT_UNIQUE_DOMAINS_HASHES, + true, + )? + .finish(); + Ok(()) + } + } + pub struct EngineArgs<'a> { + pub lists: Option< + flatbuffers::WIPOffset< + flatbuffers::Vector<'a, flatbuffers::ForwardsUOffset>>, + >, + >, + pub unique_domains_hashes: Option>>, + } + impl<'a> Default for EngineArgs<'a> { + #[inline] + fn default() -> Self { + EngineArgs { + lists: None, // required field + unique_domains_hashes: None, // required field + } + } + } + + pub struct EngineBuilder<'a: 'b, 'b, A: flatbuffers::Allocator + 'a> { + fbb_: &'b mut flatbuffers::FlatBufferBuilder<'a, A>, + start_: flatbuffers::WIPOffset, + } + impl<'a: 'b, 'b, A: flatbuffers::Allocator + 'a> EngineBuilder<'a, 'b, A> { + #[inline] + pub fn add_lists( + &mut self, + lists: flatbuffers::WIPOffset< + flatbuffers::Vector<'b, flatbuffers::ForwardsUOffset>>, + >, + ) { + self.fbb_ + .push_slot_always::>(Engine::VT_LISTS, lists); + } + #[inline] + pub fn add_unique_domains_hashes( + &mut self, + unique_domains_hashes: flatbuffers::WIPOffset>, + ) { + self.fbb_.push_slot_always::>( + Engine::VT_UNIQUE_DOMAINS_HASHES, + unique_domains_hashes, + ); + } + #[inline] + pub fn new( + _fbb: &'b mut flatbuffers::FlatBufferBuilder<'a, A>, + ) -> EngineBuilder<'a, 'b, A> { + let start = _fbb.start_table(); + EngineBuilder { + fbb_: _fbb, + start_: start, + } + } + #[inline] + pub fn finish(self) -> flatbuffers::WIPOffset> { + let o = self.fbb_.end_table(self.start_); + self.fbb_.required(o, Engine::VT_LISTS, "lists"); + self.fbb_ + .required(o, Engine::VT_UNIQUE_DOMAINS_HASHES, "unique_domains_hashes"); + flatbuffers::WIPOffset::new(o.value()) + } + } + + impl core::fmt::Debug for Engine<'_> { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + let mut ds = f.debug_struct("Engine"); + ds.field("lists", &self.lists()); + ds.field("unique_domains_hashes", &self.unique_domains_hashes()); + ds.finish() + } + } + #[non_exhaustive] + #[derive(Debug, Clone, PartialEq)] + pub struct EngineT { + pub lists: Vec, + pub unique_domains_hashes: Vec, + } + impl Default for EngineT { + fn default() -> Self { + Self { + lists: Default::default(), + unique_domains_hashes: Default::default(), + } + } + } + impl EngineT { + pub fn pack<'b, A: flatbuffers::Allocator + 'b>( + &self, + _fbb: &mut flatbuffers::FlatBufferBuilder<'b, A>, + ) -> flatbuffers::WIPOffset> { + let lists = Some({ + let x = &self.lists; + let w: Vec<_> = x.iter().map(|t| t.pack(_fbb)).collect(); + _fbb.create_vector(&w) + }); + let unique_domains_hashes = Some({ + let x = &self.unique_domains_hashes; + _fbb.create_vector(x) + }); + Engine::create( + _fbb, + &EngineArgs { + lists, unique_domains_hashes, }, ) } } #[inline] - /// Verifies that a buffer of bytes contains a `NetworkFilterList` + /// Verifies that a buffer of bytes contains a `Engine` /// and returns it. /// Note that verification is still experimental and may not /// catch every error, or be maximally performant. For the /// previous, unchecked, behavior use - /// `root_as_network_filter_list_unchecked`. - pub fn root_as_network_filter_list( - buf: &[u8], - ) -> Result { - flatbuffers::root::(buf) + /// `root_as_engine_unchecked`. + pub fn root_as_engine(buf: &[u8]) -> Result { + flatbuffers::root::(buf) } #[inline] /// Verifies that a buffer of bytes contains a size prefixed - /// `NetworkFilterList` and returns it. + /// `Engine` and returns it. /// Note that verification is still experimental and may not /// catch every error, or be maximally performant. For the /// previous, unchecked, behavior use - /// `size_prefixed_root_as_network_filter_list_unchecked`. - pub fn size_prefixed_root_as_network_filter_list( + /// `size_prefixed_root_as_engine_unchecked`. + pub fn size_prefixed_root_as_engine( buf: &[u8], - ) -> Result { - flatbuffers::size_prefixed_root::(buf) + ) -> Result { + flatbuffers::size_prefixed_root::(buf) } #[inline] /// Verifies, with the given options, that a buffer of bytes - /// contains a `NetworkFilterList` and returns it. + /// contains a `Engine` and returns it. /// Note that verification is still experimental and may not /// catch every error, or be maximally performant. For the /// previous, unchecked, behavior use - /// `root_as_network_filter_list_unchecked`. - pub fn root_as_network_filter_list_with_opts<'b, 'o>( + /// `root_as_engine_unchecked`. + pub fn root_as_engine_with_opts<'b, 'o>( opts: &'o flatbuffers::VerifierOptions, buf: &'b [u8], - ) -> Result, flatbuffers::InvalidFlatbuffer> { - flatbuffers::root_with_opts::>(opts, buf) + ) -> Result, flatbuffers::InvalidFlatbuffer> { + flatbuffers::root_with_opts::>(opts, buf) } #[inline] /// Verifies, with the given verifier options, that a buffer of - /// bytes contains a size prefixed `NetworkFilterList` and returns + /// bytes contains a size prefixed `Engine` and returns /// it. Note that verification is still experimental and may not /// catch every error, or be maximally performant. For the /// previous, unchecked, behavior use - /// `root_as_network_filter_list_unchecked`. - pub fn size_prefixed_root_as_network_filter_list_with_opts<'b, 'o>( + /// `root_as_engine_unchecked`. + pub fn size_prefixed_root_as_engine_with_opts<'b, 'o>( opts: &'o flatbuffers::VerifierOptions, buf: &'b [u8], - ) -> Result, flatbuffers::InvalidFlatbuffer> { - flatbuffers::size_prefixed_root_with_opts::>(opts, buf) + ) -> Result, flatbuffers::InvalidFlatbuffer> { + flatbuffers::size_prefixed_root_with_opts::>(opts, buf) } #[inline] - /// Assumes, without verification, that a buffer of bytes contains a NetworkFilterList and returns it. + /// Assumes, without verification, that a buffer of bytes contains a Engine and returns it. /// # Safety - /// Callers must trust the given bytes do indeed contain a valid `NetworkFilterList`. - pub unsafe fn root_as_network_filter_list_unchecked(buf: &[u8]) -> NetworkFilterList { - flatbuffers::root_unchecked::(buf) + /// Callers must trust the given bytes do indeed contain a valid `Engine`. + pub unsafe fn root_as_engine_unchecked(buf: &[u8]) -> Engine { + flatbuffers::root_unchecked::(buf) } #[inline] - /// Assumes, without verification, that a buffer of bytes contains a size prefixed NetworkFilterList and returns it. + /// Assumes, without verification, that a buffer of bytes contains a size prefixed Engine and returns it. /// # Safety - /// Callers must trust the given bytes do indeed contain a valid size prefixed `NetworkFilterList`. - pub unsafe fn size_prefixed_root_as_network_filter_list_unchecked( - buf: &[u8], - ) -> NetworkFilterList { - flatbuffers::size_prefixed_root_unchecked::(buf) + /// Callers must trust the given bytes do indeed contain a valid size prefixed `Engine`. + pub unsafe fn size_prefixed_root_as_engine_unchecked(buf: &[u8]) -> Engine { + flatbuffers::size_prefixed_root_unchecked::(buf) } #[inline] - pub fn finish_network_filter_list_buffer<'a, 'b, A: flatbuffers::Allocator + 'a>( + pub fn finish_engine_buffer<'a, 'b, A: flatbuffers::Allocator + 'a>( fbb: &'b mut flatbuffers::FlatBufferBuilder<'a, A>, - root: flatbuffers::WIPOffset>, + root: flatbuffers::WIPOffset>, ) { fbb.finish(root, None); } #[inline] - pub fn finish_size_prefixed_network_filter_list_buffer< - 'a, - 'b, - A: flatbuffers::Allocator + 'a, - >( + pub fn finish_size_prefixed_engine_buffer<'a, 'b, A: flatbuffers::Allocator + 'a>( fbb: &'b mut flatbuffers::FlatBufferBuilder<'a, A>, - root: flatbuffers::WIPOffset>, + root: flatbuffers::WIPOffset>, ) { fbb.finish_size_prefixed(root, None); } diff --git a/src/network_filter_list.rs b/src/network_filter_list.rs index 0bde86e2..e80704cd 100644 --- a/src/network_filter_list.rs +++ b/src/network_filter_list.rs @@ -3,13 +3,12 @@ use std::{collections::HashMap, collections::HashSet, fmt}; use crate::filters::fb_network::flat::fb; -use crate::filters::fb_network::{FlatNetworkFilter, FlatNetworkFiltersListBuilder}; +use crate::filters::fb_network::{FlatNetworkFilter, NetworkFilterSharedState}; use crate::filters::flat_filter_map::FlatFilterMap; use crate::filters::network::{ NetworkFilter, NetworkFilterMask, NetworkFilterMaskHelper, NetworkMatchable, }; -use crate::filters::unsafe_tools::{fb_vector_to_slice, VerifiedFlatFilterListMemory}; -use crate::optimizer; +use crate::filters::unsafe_tools::fb_vector_to_slice; use crate::regex_manager::RegexManager; use crate::request::Request; use crate::utils::{fast_hash, to_short_hash, Hash, ShortHash}; @@ -56,141 +55,20 @@ pub enum NetworkFilterListParsingError { } /// Internal structure to keep track of a collection of network filters. -pub(crate) struct NetworkFilterList { - pub(crate) memory: VerifiedFlatFilterListMemory, - pub(crate) unique_domains_hashes_map: HashMap, +pub(crate) struct NetworkFilterList<'a> { + pub(crate) list: fb::NetworkFilterList<'a>, + pub(crate) shared_state: &'a NetworkFilterSharedState, } -impl Default for NetworkFilterList { - fn default() -> Self { - let mut builder = FlatNetworkFiltersListBuilder::new(); - let memory = builder.finish(HashMap::new()); - Self { - memory, - unique_domains_hashes_map: HashMap::new(), - } - } -} - -impl NetworkFilterList { - /// Create a new [NetworkFilterList] from raw memory (includes verification). - pub(crate) fn try_from_unverified_memory( - flatbuffer_memory: Vec, - ) -> Result { - let memory = VerifiedFlatFilterListMemory::from_raw(flatbuffer_memory) - .map_err(NetworkFilterListParsingError::InvalidFlatbuffer)?; - - Self::try_from_verified_memory(memory) - } - - pub(crate) fn try_from_verified_memory( - memory: VerifiedFlatFilterListMemory, - ) -> Result { - let root = memory.filter_list(); - - // Reconstruct the unique_domains_hashes_map from the flatbuffer data - let len = root.unique_domains_hashes().len(); - let mut unique_domains_hashes_map: HashMap = - HashMap::with_capacity(len); - for (index, hash) in root.unique_domains_hashes().iter().enumerate() { - unique_domains_hashes_map.insert( - hash, - u32::try_from(index) - .map_err(|_| NetworkFilterListParsingError::UniqueDomainsOutOfBounds(index))?, - ); - } - - Ok(Self { - memory, - unique_domains_hashes_map, - }) - } - +impl NetworkFilterList<'_> { pub fn get_filter_map(&self) -> FlatFilterMap { - let filters_list = self.memory.filter_list(); + let filters_list = &self.list; FlatFilterMap::new( fb_vector_to_slice(filters_list.filter_map_index()), filters_list.filter_map_values(), ) } - pub fn new(filters: Vec, optimize: bool) -> Self { - // Compute tokens for all filters - let filter_tokens: Vec<_> = filters - .into_iter() - .map(|filter| { - let tokens = filter.get_tokens(); - (filter, tokens) - }) - .collect(); - // compute the tokens' frequency histogram - let (total_number_of_tokens, tokens_histogram) = token_histogram(&filter_tokens); - - let mut flat_builder = FlatNetworkFiltersListBuilder::new(); - let mut filter_map = HashMap::>::new(); - - let mut optimizable = HashMap::>::new(); - { - for (network_filter, multi_tokens) in filter_tokens { - let index = if !optimize - || !optimizer::is_filter_optimizable_by_patterns(&network_filter) - { - Some(flat_builder.add(&network_filter)) - } else { - None - }; - - for tokens in multi_tokens { - let mut best_token: ShortHash = 0; - let mut min_count = total_number_of_tokens + 1; - for token in tokens { - let token = to_short_hash(token); - match tokens_histogram.get(&token) { - None => { - min_count = 0; - best_token = token - } - Some(&count) if count < min_count => { - min_count = count; - best_token = token - } - _ => {} - } - } - if let Some(index) = index { - insert_dup(&mut filter_map, best_token, index); - } else { - insert_dup(&mut optimizable, best_token, network_filter.clone()); - } - } // tokens - } - } - - if optimize { - // Sort the entries to ensure deterministic iteration order - let mut optimizable_entries: Vec<_> = optimizable.drain().collect(); - optimizable_entries.sort_unstable_by_key(|(token, _)| *token); - - for (token, v) in optimizable_entries { - let optimized = optimizer::optimize(v); - - for filter in optimized { - let index = flat_builder.add(&filter); - insert_dup(&mut filter_map, token, index); - } - } - } else { - debug_assert!( - optimizable.is_empty(), - "Should be empty if optimization is off" - ); - } - - let memory = flat_builder.finish(filter_map); - - Self::try_from_verified_memory(memory).unwrap_or_default() - } - /// Returns the first found filter, if any, that matches the given request. The backing storage /// has a non-deterministic order, so this should be used for any category of filters where a /// match from each would be functionally equivalent. For example, if two different exception @@ -202,7 +80,7 @@ impl NetworkFilterList { active_tags: &HashSet, regex_manager: &mut RegexManager, ) -> Option { - let filters_list = self.memory.filter_list(); + let filters_list = self.list; if filters_list.filter_map_index().is_empty() { return None; @@ -212,7 +90,7 @@ impl NetworkFilterList { for token in request.get_tokens_for_match() { for (index, fb_filter) in filter_map.get(to_short_hash(*token)) { - let filter = FlatNetworkFilter::new(&fb_filter, index, self); + let filter = FlatNetworkFilter::new(&fb_filter, index, self.shared_state); // if matched, also needs to be tagged with an active tag (or not tagged at all) if filter.matches(request, regex_manager) @@ -242,7 +120,7 @@ impl NetworkFilterList { ) -> Vec { let mut filters: Vec = vec![]; - let filters_list = self.memory.filter_list(); + let filters_list = self.list; if filters_list.filter_map_index().is_empty() { return filters; @@ -252,7 +130,7 @@ impl NetworkFilterList { for token in request.get_tokens_for_match() { for (index, fb_filter) in filter_map.get(to_short_hash(*token)) { - let filter = FlatNetworkFilter::new(&fb_filter, index, self); + let filter = FlatNetworkFilter::new(&fb_filter, index, self.shared_state); // if matched, also needs to be tagged with an active tag (or not tagged at all) if filter.matches(request, regex_manager) @@ -270,25 +148,6 @@ impl NetworkFilterList { } } -/// Inserts a value into the `Vec` under the specified key in the `HashMap`. The entry will be -/// created if it does not exist. If it already exists, it will be inserted in the `Vec` in a -/// sorted order. -pub(crate) fn insert_dup( - map: &mut HashMap, H>, - k: K, - v: V, -) where - K: std::cmp::Ord + std::hash::Hash, - V: PartialOrd, -{ - let entry = map.entry(k).or_default(); - - match entry.binary_search_by(|f| f.partial_cmp(&v).unwrap_or(std::cmp::Ordering::Equal)) { - Ok(_pos) => (), // Can occur if the exact same rule is inserted twice. No reason to add anything. - Err(slot) => entry.insert(slot, v), - } -} - pub(crate) fn token_histogram( filter_tokens: &[(T, Vec>)], ) -> (u32, HashMap) { diff --git a/tests/unit/blocker.rs b/tests/unit/blocker.rs index 3b67a1e2..04dd76f6 100644 --- a/tests/unit/blocker.rs +++ b/tests/unit/blocker.rs @@ -1472,17 +1472,17 @@ mod legacy_rule_parsing_tests { // Some filters in the filter_map are pointed at by multiple tokens, increasing the total number of items assert!( - blocker.exceptions.get_filter_map().total_size() - + blocker.generic_hide.get_filter_map().total_size() + blocker.exceptions().get_filter_map().total_size() + + blocker.generic_hide().get_filter_map().total_size() >= expectation.exceptions, "Number of collected exceptions does not match expectation" ); assert!( - blocker.filters.get_filter_map().total_size() - + blocker.importants.get_filter_map().total_size() - + blocker.redirects.get_filter_map().total_size() - + blocker.csp.get_filter_map().total_size() + blocker.filters().get_filter_map().total_size() + + blocker.importants().get_filter_map().total_size() + + blocker.redirects().get_filter_map().total_size() + + blocker.csp().get_filter_map().total_size() >= expectation.filters - expectation.duplicates, "Number of collected network filters does not match expectation" ); From 6c18fe33232927b08bb7f52e448214945b3fe339 Mon Sep 17 00:00:00 2001 From: Mikhail Atuchin Date: Fri, 20 Jun 2025 21:41:55 +0400 Subject: [PATCH 2/7] use VerifiedFlatbufferMemory in storage --- src/data_format/mod.rs | 8 ++++---- src/data_format/storage.rs | 13 ++++++------- src/engine.rs | 6 +++--- 3 files changed, 13 insertions(+), 14 deletions(-) diff --git a/src/data_format/mod.rs b/src/data_format/mod.rs index 489c8ab5..b2e86263 100644 --- a/src/data_format/mod.rs +++ b/src/data_format/mod.rs @@ -9,8 +9,8 @@ mod storage; pub(crate) mod utils; -use crate::blocker::Blocker; use crate::cosmetic_filter_cache::CosmeticFilterCache; +use crate::filters::unsafe_tools::VerifiedFlatbufferMemory; use crate::network_filter_list::NetworkFilterListParsingError; /// Newer formats start with this magic byte sequence. @@ -62,16 +62,16 @@ impl From for DeserializationError { } pub(crate) fn serialize_engine( - blocker: &Blocker, + flatbuffer_memory: &VerifiedFlatbufferMemory, cfc: &CosmeticFilterCache, ) -> Result, SerializationError> { - let serialize_format = storage::SerializeFormat::from((blocker, cfc)); + let serialize_format = storage::SerializeFormat::from((flatbuffer_memory, cfc)); serialize_format.serialize() } pub(crate) fn deserialize_engine( serialized: &[u8], -) -> Result<(Blocker, CosmeticFilterCache), DeserializationError> { +) -> Result<(VerifiedFlatbufferMemory, CosmeticFilterCache), DeserializationError> { let deserialize_format = storage::DeserializeFormat::deserialize(serialized)?; deserialize_format.try_into() } diff --git a/src/data_format/storage.rs b/src/data_format/storage.rs index 1b60301f..3ffeab14 100644 --- a/src/data_format/storage.rs +++ b/src/data_format/storage.rs @@ -9,7 +9,6 @@ use std::collections::{HashMap, HashSet}; use rmp_serde as rmps; use serde::{Deserialize, Serialize}; -use crate::blocker::Blocker; use crate::cosmetic_filter_cache::{CosmeticFilterCache, HostnameRuleDb, ProceduralOrActionFilter}; use crate::filters::unsafe_tools::VerifiedFlatbufferMemory; use crate::utils::Hash; @@ -253,11 +252,11 @@ impl DeserializeFormat { } } -impl<'a> From<(&'a Blocker, &'a CosmeticFilterCache)> for SerializeFormat<'a> { - fn from(v: (&'a Blocker, &'a CosmeticFilterCache)) -> Self { - let (blocker, cfc) = v; +impl<'a> From<(&'a VerifiedFlatbufferMemory, &'a CosmeticFilterCache)> for SerializeFormat<'a> { + fn from(v: (&'a VerifiedFlatbufferMemory, &'a CosmeticFilterCache)) -> Self { + let (memory, cfc) = v; Self { - flatbuffer_memory: blocker.memory.data().to_vec(), + flatbuffer_memory: memory.data().to_vec(), resources: LegacyRedirectResourceStorage::default(), @@ -278,7 +277,7 @@ impl<'a> From<(&'a Blocker, &'a CosmeticFilterCache)> for SerializeFormat<'a> { } } -impl TryFrom for (Blocker, CosmeticFilterCache) { +impl TryFrom for (VerifiedFlatbufferMemory, CosmeticFilterCache) { fn try_from(v: DeserializeFormat) -> Result { use crate::cosmetic_filter_cache::HostnameFilterBin; @@ -291,7 +290,7 @@ impl TryFrom for (Blocker, CosmeticFilterCache) { .map_err(DeserializationError::FlatBufferParsingError)?; Ok(( - Blocker::from_verified_memory(memory), + memory, CosmeticFilterCache { simple_class_rules: v.simple_class_rules, simple_id_rules: v.simple_id_rules, diff --git a/src/engine.rs b/src/engine.rs index 1fe39093..7c20ead6 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -247,7 +247,7 @@ impl Engine { /// Serializes the `Engine` into a binary format so that it can be quickly reloaded later. pub fn serialize(&self) -> Result, crate::data_format::SerializationError> { - crate::data_format::serialize_engine(&self.blocker, &self.cosmetic_cache) + crate::data_format::serialize_engine(&self.blocker.memory, &self.cosmetic_cache) } /// Deserialize the `Engine` from the binary format generated by `Engine::serialize`. @@ -260,8 +260,8 @@ impl Engine { serialized: &[u8], ) -> Result<(), crate::data_format::DeserializationError> { let current_tags = self.blocker.tags_enabled(); - let (blocker, cosmetic_cache) = crate::data_format::deserialize_engine(serialized)?; - self.blocker = blocker; + let (memory, cosmetic_cache) = crate::data_format::deserialize_engine(serialized)?; + self.blocker = Blocker::from_verified_memory(memory); self.blocker .use_tags(¤t_tags.iter().map(|s| &**s).collect::>()); self.cosmetic_cache = cosmetic_cache; From 27de61479180daf88d41ecd5ca11b605432b697d Mon Sep 17 00:00:00 2001 From: Mikhail Atuchin Date: Sat, 21 Jun 2025 00:46:39 +0400 Subject: [PATCH 3/7] Move memory to Engine --- src/blocker.rs | 100 ++++++++---------------------------- src/engine.rs | 30 ++++++----- src/filters/fb_network.rs | 32 ++++++++++-- src/filters/flat_builder.rs | 64 ++++++++++++++++++++++- src/filters/unsafe_tools.rs | 10 ++++ src/network_filter_list.rs | 4 +- 6 files changed, 139 insertions(+), 101 deletions(-) diff --git a/src/blocker.rs b/src/blocker.rs index edb77aa3..474a96c3 100644 --- a/src/blocker.rs +++ b/src/blocker.rs @@ -3,18 +3,15 @@ use memchr::{memchr as find_char, memrchr as find_char_reverse}; use once_cell::sync::Lazy; use serde::Serialize; -use std::collections::{HashMap, HashSet}; +use std::collections::HashSet; use std::ops::DerefMut; -use crate::filters::fb_network::NetworkFilterSharedState; -use crate::filters::flat_builder::FlatBufferBuilder; -use crate::filters::network::{NetworkFilter, NetworkFilterMaskHelper}; -use crate::filters::unsafe_tools::VerifiedFlatbufferMemory; +use crate::filters::fb_network::SharedStateRef; +use crate::filters::network::NetworkFilterMaskHelper; use crate::network_filter_list::NetworkFilterList; use crate::regex_manager::{RegexManager, RegexManagerDiscardPolicy}; use crate::request::Request; use crate::resources::ResourceStorage; -use crate::utils::Hash; /// Options used when constructing a [`Blocker`]. pub struct BlockerOptions { @@ -79,10 +76,9 @@ pub(crate) enum FilterId { TaggedFiltersAll = 7, Size = 8, } + /// Stores network filters for efficient querying. pub struct Blocker { - pub(crate) memory: VerifiedFlatbufferMemory, - // Enabled tags are not serialized - when deserializing, tags of the existing // instance (the one we are recreating lists into) are maintained pub(crate) tags_enabled: HashSet, @@ -92,7 +88,7 @@ pub struct Blocker { #[cfg(not(feature = "unsync-regex-caching"))] pub(crate) regex_manager: std::sync::Mutex, - pub(crate) shared_state: NetworkFilterSharedState, + pub(crate) shared_state: SharedStateRef, } impl Blocker { @@ -105,7 +101,7 @@ impl Blocker { pub(crate) fn get_list(&self, id: FilterId) -> NetworkFilterList { // TODO: verify lists() size and id is in range NetworkFilterList { - list: self.memory.root().lists().get(id as usize), + list: self.shared_state.memory.root().lists().get(id as usize), shared_state: &self.shared_state, } } @@ -439,82 +435,26 @@ impl Blocker { Some(merged) } - pub(crate) fn from_verified_memory(memory: VerifiedFlatbufferMemory) -> Self { - // Reconstruct the unique_domains_hashes_map from the flatbuffer data - let root = memory.root(); - let mut unique_domains_hashes_map: HashMap = HashMap::new(); - for (index, hash) in root.unique_domains_hashes().iter().enumerate() { - unique_domains_hashes_map.insert(hash, index as u32); - } - - let shared_state = NetworkFilterSharedState { - unique_domains_hashes_map, - }; - + pub(crate) fn from_shared_state(shared_state: SharedStateRef) -> Self { Self { + shared_state, tags_enabled: HashSet::new(), regex_manager: Default::default(), - memory, - shared_state, } } - pub fn new(mut network_filters: Vec, options: &BlockerOptions) -> Self { - // Injections - // TODO: resource handling - - let mut builder = FlatBufferBuilder::new(FilterId::Size as usize); - - let mut badfilter_ids: HashSet = HashSet::new(); - for filter in network_filters.iter() { - if filter.is_badfilter() { - badfilter_ids.insert(filter.get_id_without_badfilter()); - } - } - for filter in network_filters.drain(..) { - // skip any bad filters - let filter_id = filter.get_id(); - if badfilter_ids.contains(&filter_id) || filter.is_badfilter() { - continue; - } - - // Redirects are independent of blocking behavior. - if filter.is_redirect() { - builder.add_filter(filter.clone(), FilterId::Redirects as u32); - } - - let list_id: FilterId = if filter.is_csp() { - FilterId::Csp - } else if filter.is_removeparam() { - FilterId::RemoveParam - } else if filter.is_generic_hide() { - FilterId::GenericHide - } else if filter.is_exception() { - FilterId::Exceptions - } else if filter.is_important() { - FilterId::Importants - } else if filter.tag.is_some() && !filter.is_redirect() { - // `tag` + `redirect` is unsupported for now. - FilterId::TaggedFiltersAll - } else if (filter.is_redirect() && filter.also_block_redirect()) - || !filter.is_redirect() - { - FilterId::Filters - } else { - continue; - }; - - builder.add_filter(filter, list_id as u32); - } - - let memory = builder.finish(if options.enable_optimizations { - // Don't optimize removeparam, since it can fuse filters without respecting distinct - |id: u32| id != FilterId::RemoveParam as u32 - } else { - |_| false - }); - - Self::from_verified_memory(memory) + #[cfg(test)] + pub(crate) fn new( + network_filters: Vec, + options: &BlockerOptions, + ) -> Self { + use crate::filters::fb_network::SharedState; + use crate::filters::flat_builder::FlatBufferBuilder; + + let memory = + FlatBufferBuilder::make_flatbuffer(network_filters, options.enable_optimizations); + let shared_state = SharedState::new(memory); + Self::from_shared_state(shared_state) } pub fn use_tags(&mut self, tags: &[&str]) { diff --git a/src/engine.rs b/src/engine.rs index 7c20ead6..bce29554 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -1,7 +1,9 @@ //! The adblock [`Engine`] is the primary interface for adblocking. -use crate::blocker::{Blocker, BlockerOptions, BlockerResult}; +use crate::blocker::{Blocker, BlockerResult}; use crate::cosmetic_filter_cache::{CosmeticFilterCache, UrlSpecificResources}; +use crate::filters::fb_network::{SharedState, SharedStateRef}; +use crate::filters::flat_builder::FlatBufferBuilder; use crate::lists::{FilterSet, ParseOptions}; use crate::regex_manager::RegexManagerDiscardPolicy; use crate::request::Request; @@ -46,6 +48,7 @@ pub struct Engine { blocker: Blocker, cosmetic_cache: CosmeticFilterCache, resources: ResourceStorage, + shared_state: SharedStateRef, } impl Default for Engine { @@ -60,15 +63,16 @@ impl Engine { /// used with deserialization. /// - `optimize` specifies whether or not to attempt to compress the internal representation by /// combining similar rules. - pub fn new(optimize: bool) -> Self { - let blocker_options = BlockerOptions { - enable_optimizations: optimize, - }; + pub fn new(_optimize: bool) -> Self { + // TODO: remove _optimize? + + let shared_state = SharedState::new(Default::default()); Self { - blocker: Blocker::new(vec![], &blocker_options), + blocker: Blocker::from_shared_state(shared_state.clone()), cosmetic_cache: CosmeticFilterCache::new(), resources: ResourceStorage::default(), + shared_state, } } @@ -110,14 +114,15 @@ impl Engine { .. } = set; - let blocker_options = BlockerOptions { - enable_optimizations: optimize, - }; + let memory = FlatBufferBuilder::make_flatbuffer(network_filters, optimize); + + let shared_state = SharedState::new(memory); Self { - blocker: Blocker::new(network_filters, &blocker_options), + blocker: Blocker::from_shared_state(shared_state.clone()), cosmetic_cache: CosmeticFilterCache::from_rules(cosmetic_filters), resources: ResourceStorage::default(), + shared_state, } } @@ -247,7 +252,7 @@ impl Engine { /// Serializes the `Engine` into a binary format so that it can be quickly reloaded later. pub fn serialize(&self) -> Result, crate::data_format::SerializationError> { - crate::data_format::serialize_engine(&self.blocker.memory, &self.cosmetic_cache) + crate::data_format::serialize_engine(&self.shared_state.memory, &self.cosmetic_cache) } /// Deserialize the `Engine` from the binary format generated by `Engine::serialize`. @@ -261,7 +266,8 @@ impl Engine { ) -> Result<(), crate::data_format::DeserializationError> { let current_tags = self.blocker.tags_enabled(); let (memory, cosmetic_cache) = crate::data_format::deserialize_engine(serialized)?; - self.blocker = Blocker::from_verified_memory(memory); + self.shared_state = SharedState::new(memory); + self.blocker = Blocker::from_shared_state(self.shared_state.clone()); self.blocker .use_tags(¤t_tags.iter().map(|s| &**s).collect::>()); self.cosmetic_cache = cosmetic_cache; diff --git a/src/filters/fb_network.rs b/src/filters/fb_network.rs index 3e1fd64e..3a007523 100644 --- a/src/filters/fb_network.rs +++ b/src/filters/fb_network.rs @@ -3,7 +3,7 @@ use std::collections::HashMap; use crate::filters::network::{NetworkFilterMask, NetworkFilterMaskHelper, NetworkMatchable}; -use crate::filters::unsafe_tools::fb_vector_to_slice; +use crate::filters::unsafe_tools::{fb_vector_to_slice, VerifiedFlatbufferMemory}; use crate::regex_manager::RegexManager; use crate::request::Request; @@ -67,15 +67,37 @@ impl ExactSizeIterator for FlatPatternsIterator<'_> { } } -#[derive(Debug, Default)] -pub(crate) struct NetworkFilterSharedState { +// TODO: do we need another feature for this? +#[cfg(feature = "unsync-regex-caching")] +pub(crate) type SharedStateRef = std::rc::Rc; +#[cfg(not(feature = "unsync-regex-caching"))] +pub(crate) type SharedStateRef = std::rc::Arc; + +#[derive(Default)] +pub(crate) struct SharedState { + pub(crate) memory: VerifiedFlatbufferMemory, pub(crate) unique_domains_hashes_map: HashMap, } +impl SharedState { + pub(crate) fn new(memory: VerifiedFlatbufferMemory) -> SharedStateRef { + // Reconstruct the unique_domains_hashes_map from the flatbuffer data + let root = memory.root(); + let mut unique_domains_hashes_map: HashMap = HashMap::new(); + for (index, hash) in root.unique_domains_hashes().iter().enumerate() { + unique_domains_hashes_map.insert(hash, index as u32); + } + SharedStateRef::new(Self { + memory, + unique_domains_hashes_map, + }) + } +} + /// Internal implementation of [NetworkFilter] that is compatible with flatbuffers. pub(crate) struct FlatNetworkFilter<'a> { key: u64, - shared_state: &'a NetworkFilterSharedState, + shared_state: &'a SharedState, fb_filter: &'a fb::NetworkFilter<'a>, pub(crate) mask: NetworkFilterMask, @@ -86,7 +108,7 @@ impl<'a> FlatNetworkFilter<'a> { pub fn new( filter: &'a fb::NetworkFilter<'a>, index: usize, - shared_state: &'a NetworkFilterSharedState, + shared_state: &'a SharedState, ) -> Self { Self { fb_filter: filter, diff --git a/src/filters/flat_builder.rs b/src/filters/flat_builder.rs index af63f384..3f1c1b2d 100644 --- a/src/filters/flat_builder.rs +++ b/src/filters/flat_builder.rs @@ -1,11 +1,12 @@ //! Builder for creating flatbuffer-compatible Engine. -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::vec; use flatbuffers::WIPOffset; -use crate::filters::network::NetworkFilter; +use crate::blocker::FilterId; +use crate::filters::network::{NetworkFilter, NetworkFilterMaskHelper}; use crate::filters::unsafe_tools::VerifiedFlatbufferMemory; use crate::network_filter_list::token_histogram; use crate::optimizer; @@ -261,4 +262,63 @@ impl FlatBufferBuilder { }, ) } + + pub fn make_flatbuffer( + mut network_filters: Vec, + optimize: bool, + ) -> VerifiedFlatbufferMemory { + // Injections + // TODO: resource handling + + let mut builder = FlatBufferBuilder::new(FilterId::Size as usize); + + let mut badfilter_ids: HashSet = HashSet::new(); + for filter in network_filters.iter() { + if filter.is_badfilter() { + badfilter_ids.insert(filter.get_id_without_badfilter()); + } + } + for filter in network_filters.drain(..) { + // skip any bad filters + let filter_id = filter.get_id(); + if badfilter_ids.contains(&filter_id) || filter.is_badfilter() { + continue; + } + + // Redirects are independent of blocking behavior. + if filter.is_redirect() { + builder.add_filter(filter.clone(), FilterId::Redirects as u32); + } + + let list_id: FilterId = if filter.is_csp() { + FilterId::Csp + } else if filter.is_removeparam() { + FilterId::RemoveParam + } else if filter.is_generic_hide() { + FilterId::GenericHide + } else if filter.is_exception() { + FilterId::Exceptions + } else if filter.is_important() { + FilterId::Importants + } else if filter.tag.is_some() && !filter.is_redirect() { + // `tag` + `redirect` is unsupported for now. + FilterId::TaggedFiltersAll + } else if (filter.is_redirect() && filter.also_block_redirect()) + || !filter.is_redirect() + { + FilterId::Filters + } else { + continue; + }; + + builder.add_filter(filter, list_id as u32); + } + + builder.finish(if optimize { + // Don't optimize removeparam, since it can fuse filters without respecting distinct + |id: u32| id != FilterId::RemoveParam as u32 + } else { + |_| false + }) + } } diff --git a/src/filters/unsafe_tools.rs b/src/filters/unsafe_tools.rs index f53edaa5..2dcd9d50 100644 --- a/src/filters/unsafe_tools.rs +++ b/src/filters/unsafe_tools.rs @@ -48,6 +48,16 @@ pub(crate) struct VerifiedFlatbufferMemory { start: usize, } +impl Default for VerifiedFlatbufferMemory { + fn default() -> Self { + // TODO: create an empty engine and get the memory from it + Self { + raw_data: vec![], + start: 0, + } + } +} + impl VerifiedFlatbufferMemory { pub(crate) fn from_raw(data: Vec) -> Result { let memory = Self::from_vec(data); diff --git a/src/network_filter_list.rs b/src/network_filter_list.rs index e80704cd..3f945997 100644 --- a/src/network_filter_list.rs +++ b/src/network_filter_list.rs @@ -3,7 +3,7 @@ use std::{collections::HashMap, collections::HashSet, fmt}; use crate::filters::fb_network::flat::fb; -use crate::filters::fb_network::{FlatNetworkFilter, NetworkFilterSharedState}; +use crate::filters::fb_network::{FlatNetworkFilter, SharedState}; use crate::filters::flat_filter_map::FlatFilterMap; use crate::filters::network::{ NetworkFilter, NetworkFilterMask, NetworkFilterMaskHelper, NetworkMatchable, @@ -57,7 +57,7 @@ pub enum NetworkFilterListParsingError { /// Internal structure to keep track of a collection of network filters. pub(crate) struct NetworkFilterList<'a> { pub(crate) list: fb::NetworkFilterList<'a>, - pub(crate) shared_state: &'a NetworkFilterSharedState, + pub(crate) shared_state: &'a SharedState, } impl NetworkFilterList<'_> { From dd98fb34ff34c913b442f502cd0a1466660e035d Mon Sep 17 00:00:00 2001 From: Mikhail Atuchin Date: Sat, 21 Jun 2025 01:08:01 +0400 Subject: [PATCH 4/7] Fix tests --- src/engine.rs | 2 +- src/filters/fb_network.rs | 12 +++++++++++- src/filters/unsafe_tools.rs | 10 ---------- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/engine.rs b/src/engine.rs index bce29554..955b8a46 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -66,7 +66,7 @@ impl Engine { pub fn new(_optimize: bool) -> Self { // TODO: remove _optimize? - let shared_state = SharedState::new(Default::default()); + let shared_state = SharedStateRef::new(Default::default()); Self { blocker: Blocker::from_shared_state(shared_state.clone()), diff --git a/src/filters/fb_network.rs b/src/filters/fb_network.rs index 3a007523..7ce86485 100644 --- a/src/filters/fb_network.rs +++ b/src/filters/fb_network.rs @@ -2,6 +2,7 @@ use std::collections::HashMap; +use crate::filters::flat_builder::FlatBufferBuilder; use crate::filters::network::{NetworkFilterMask, NetworkFilterMaskHelper, NetworkMatchable}; use crate::filters::unsafe_tools::{fb_vector_to_slice, VerifiedFlatbufferMemory}; @@ -73,12 +74,21 @@ pub(crate) type SharedStateRef = std::rc::Rc; #[cfg(not(feature = "unsync-regex-caching"))] pub(crate) type SharedStateRef = std::rc::Arc; -#[derive(Default)] pub(crate) struct SharedState { pub(crate) memory: VerifiedFlatbufferMemory, pub(crate) unique_domains_hashes_map: HashMap, } + +impl Default for SharedState { + fn default() -> Self { + Self { + memory: FlatBufferBuilder::make_flatbuffer(vec![], false), + unique_domains_hashes_map: HashMap::new(), + } + } +} + impl SharedState { pub(crate) fn new(memory: VerifiedFlatbufferMemory) -> SharedStateRef { // Reconstruct the unique_domains_hashes_map from the flatbuffer data diff --git a/src/filters/unsafe_tools.rs b/src/filters/unsafe_tools.rs index 2dcd9d50..f53edaa5 100644 --- a/src/filters/unsafe_tools.rs +++ b/src/filters/unsafe_tools.rs @@ -48,16 +48,6 @@ pub(crate) struct VerifiedFlatbufferMemory { start: usize, } -impl Default for VerifiedFlatbufferMemory { - fn default() -> Self { - // TODO: create an empty engine and get the memory from it - Self { - raw_data: vec![], - start: 0, - } - } -} - impl VerifiedFlatbufferMemory { pub(crate) fn from_raw(data: Vec) -> Result { let memory = Self::from_vec(data); From a4537ae3ee6ce08f2086e8c9b9cad6cebfa23dbf Mon Sep 17 00:00:00 2001 From: Mikhail Atuchin Date: Sat, 21 Jun 2025 01:09:14 +0400 Subject: [PATCH 5/7] Fix deserialization tests --- tests/unit/engine.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/unit/engine.rs b/tests/unit/engine.rs index ed83ca0d..ae7b6cea 100644 --- a/tests/unit/engine.rs +++ b/tests/unit/engine.rs @@ -183,7 +183,7 @@ mod tests { fn deserialization_generate_simple() { let mut engine = Engine::from_rules(["ad-banner"], Default::default()); let data = engine.serialize().unwrap(); - const EXPECTED_HASH: u64 = 5723845290597955159; + const EXPECTED_HASH: u64 = 14059407383857257100; assert_eq!(hash(&data), EXPECTED_HASH, "{}", HASH_MISMATCH_MSG); engine.deserialize(&data).unwrap(); } @@ -193,7 +193,7 @@ mod tests { let mut engine = Engine::from_rules(["ad-banner$tag=abc"], Default::default()); engine.use_tags(&["abc"]); let data = engine.serialize().unwrap(); - const EXPECTED_HASH: u64 = 9626816743810307798; + const EXPECTED_HASH: u64 = 1772924818985173219; assert_eq!(hash(&data), EXPECTED_HASH, "{}", HASH_MISMATCH_MSG); engine.deserialize(&data).unwrap(); } @@ -219,9 +219,9 @@ mod tests { let data = engine.serialize().unwrap(); let expected_hash = if cfg!(feature = "css-validation") { - 7254547691107602751 + 12046041060659687422 } else { - 4130628479730907288 + 11420623023091203502 }; assert_eq!(hash(&data), expected_hash, "{}", HASH_MISMATCH_MSG); From bf7d7001403e1deee0c56590d9dfa15eba7650e6 Mon Sep 17 00:00:00 2001 From: Mikhail Atuchin Date: Sat, 21 Jun 2025 01:09:35 +0400 Subject: [PATCH 6/7] cargo fmt --- src/filters/fb_network.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/filters/fb_network.rs b/src/filters/fb_network.rs index 7ce86485..9a9e3e59 100644 --- a/src/filters/fb_network.rs +++ b/src/filters/fb_network.rs @@ -79,7 +79,6 @@ pub(crate) struct SharedState { pub(crate) unique_domains_hashes_map: HashMap, } - impl Default for SharedState { fn default() -> Self { Self { From a1042ef2cae994e815430269bab62fdca9264884 Mon Sep 17 00:00:00 2001 From: Mikhail Atuchin Date: Sat, 21 Jun 2025 01:15:02 +0400 Subject: [PATCH 7/7] return the old Blocker::new --- src/blocker.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/blocker.rs b/src/blocker.rs index 474a96c3..f7ece74e 100644 --- a/src/blocker.rs +++ b/src/blocker.rs @@ -443,7 +443,8 @@ impl Blocker { } } - #[cfg(test)] + // TODO: only for tests and benchmarks + #[allow(dead_code)] pub(crate) fn new( network_filters: Vec, options: &BlockerOptions,