From a728c1abfdcd1004d5ea7d75b65e09fa92e23d05 Mon Sep 17 00:00:00 2001 From: Matthias Heinzel Date: Tue, 21 Jan 2025 12:55:23 +0100 Subject: [PATCH 1/4] Merge micro bench: fix benchmark setup The minimum length of keys for the compact index increased from 6 to 8 bytes at some point. Also, releasing a run was changed to remove the files associated with it. The cleanup code was manually doing the same, which then became unnecessary and started causing issues. Merge micro bench: release run during cleanup only --- .../Bench/Database/LSMTree/Internal/Merge.hs | 36 +++++++++---------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/bench/micro/Bench/Database/LSMTree/Internal/Merge.hs b/bench/micro/Bench/Database/LSMTree/Internal/Merge.hs index be3bc17a2..d8c8f10bc 100644 --- a/bench/micro/Bench/Database/LSMTree/Internal/Merge.hs +++ b/bench/micro/Bench/Database/LSMTree/Internal/Merge.hs @@ -1,12 +1,13 @@ module Bench.Database.LSMTree.Internal.Merge (benchmarks) where -import Control.Monad (when, zipWithM) +import Control.Monad (zipWithM) import Control.RefCount import Criterion.Main (Benchmark, bench, bgroup) import qualified Criterion.Main as Cr import Data.Bifunctor (first) import qualified Data.BloomFilter.Hash as Hash import Data.Foldable (traverse_) +import Data.IORef import qualified Data.List as List import qualified Data.Map.Strict as Map import Data.Maybe (fromMaybe) @@ -19,8 +20,7 @@ import Database.LSMTree.Extras.UTxO import Database.LSMTree.Internal.Entry import Database.LSMTree.Internal.Merge (MergeType (..)) import qualified Database.LSMTree.Internal.Merge as Merge -import Database.LSMTree.Internal.Paths (RunFsPaths (..), - pathsForRunFiles, runChecksumsPath) +import Database.LSMTree.Internal.Paths (RunFsPaths (..)) import Database.LSMTree.Internal.Run (Run) import qualified Database.LSMTree.Internal.Run as Run import Database.LSMTree.Internal.RunAcc (RunBloomFilterAlloc (..)) @@ -114,7 +114,7 @@ benchmarks = bgroup "Bench.Database.LSMTree.Internal.Merge" [ { name = "insert-large-keys-x4" -- potentially long keys , nentries = (totalEntries `div` 10) `splitInto` 4 , finserts = 1 - , randomKey = first serialiseKey . R.randomByteStringR (6, 4000) + , randomKey = first serialiseKey . R.randomByteStringR (8, 4000) } , benchMerge configWord64 { name = "insert-mixed-vals-x4" -- potentially long values @@ -201,14 +201,16 @@ benchMerge conf@Config{name} = -- thread `runs` through the environment, too. -- 2. It forces the result to normal form, which would traverse the -- whole run, so we force to WHNF ourselves and just return `()`. + + -- We make sure to immediately close resulting runs so we don't run + -- out of file handles or disk space. However, we don't want it to + -- be part of the measurement, as it includes deleting files. + -- Therefore, ... TODO Cr.perRunEnvWithCleanup - (pure (runs, outputRunPaths)) - (const (removeOutputRunFiles hasFS)) $ \(runs', p) -> do - !run <- merge hasFS hasBlockIO conf p runs' - -- Make sure to immediately close resulting runs so we don't run - -- out of file handles. Ideally this would not be measured, but at - -- least it's pretty cheap. - releaseRef run + ((runs,) <$> newIORef Nothing) + (releaseRun . snd) $ \(runs', ref) -> do + !run <- merge hasFS hasBlockIO conf outputRunPaths runs' + writeIORef ref $ Just $ releaseRef run ] where withEnv = @@ -216,13 +218,11 @@ benchMerge conf@Config{name} = (mergeEnv conf) mergeEnvCleanup - -- We need to keep the input runs, but remove the freshly created one. - removeOutputRunFiles :: FS.HasFS IO FS.HandleIO -> IO () - removeOutputRunFiles hasFS = do - traverse_ (FS.removeFile hasFS) (pathsForRunFiles outputRunPaths) - exists <- FS.doesFileExist hasFS (runChecksumsPath outputRunPaths) - when exists $ - FS.removeFile hasFS (runChecksumsPath outputRunPaths) + releaseRun :: IORef (Maybe (IO ())) -> IO () + releaseRun ref = + readIORef ref >>= \case + Nothing -> pure () + Just release -> release merge :: FS.HasFS IO FS.HandleIO From 73f9e372a84d06c5e8a0529ccf25a57682a7086b Mon Sep 17 00:00:00 2001 From: Matthias Heinzel Date: Tue, 21 Jan 2025 16:17:17 +0100 Subject: [PATCH 2/4] Lookup micro bench: fix file paths in setup This previously tried creating the wbblobs file with an empty path, so there was an existing directory (the benchmark's root directory) already where the file was supposed to go. --- bench/micro/Bench/Database/LSMTree/Internal/Lookup.hs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bench/micro/Bench/Database/LSMTree/Internal/Lookup.hs b/bench/micro/Bench/Database/LSMTree/Internal/Lookup.hs index 39bf4d401..0b75b5161 100644 --- a/bench/micro/Bench/Database/LSMTree/Internal/Lookup.hs +++ b/bench/micro/Bench/Database/LSMTree/Internal/Lookup.hs @@ -193,7 +193,7 @@ lookupsInBatchesEnv Config {..} = do hasBlockIO <- FS.ioHasBlockIO hasFS (fromMaybe FS.defaultIOCtxParams ioctxps) let wb = WB.fromMap storedKeys fsps = RunFsPaths (FS.mkFsPath []) (RunNumber 0) - wbblobs <- WBB.new hasFS (FS.mkFsPath []) + wbblobs <- WBB.new hasFS (FS.mkFsPath ["0.wbblobs"]) r <- Run.fromWriteBuffer hasFS hasBlockIO caching (RunAllocFixed 10) fsps wb wbblobs let NumEntries nentriesReal = Run.size r assert (nentriesReal == nentries) $ pure () From 8c06336921dcc97ec940625ce306d697dc75b555 Mon Sep 17 00:00:00 2001 From: Matthias Heinzel Date: Tue, 21 Jan 2025 16:40:54 +0100 Subject: [PATCH 3/4] Lookup micro bench: actually generate blobs With recent changes related to WriteBufferBlobs, flushing a write buffer started copying its blobs to a new file. This made the setup fail, since it generated random blob references, but no actual blobs they point at. --- .../Bench/Database/LSMTree/Internal/Lookup.hs | 42 +++++++++---------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/bench/micro/Bench/Database/LSMTree/Internal/Lookup.hs b/bench/micro/Bench/Database/LSMTree/Internal/Lookup.hs index 0b75b5161..3d4a01d7f 100644 --- a/bench/micro/Bench/Database/LSMTree/Internal/Lookup.hs +++ b/bench/micro/Bench/Database/LSMTree/Internal/Lookup.hs @@ -2,7 +2,6 @@ module Bench.Database.LSMTree.Internal.Lookup (benchmarks) where -import Control.Exception (assert) import Control.Monad import Control.Monad.ST.Strict (stToIO) import Control.RefCount @@ -11,16 +10,16 @@ import Criterion.Main (Benchmark, bench, bgroup, env, envWithCleanup, import Data.Arena (ArenaManager, closeArena, newArena, newArenaManager, withArena) import Data.Bifunctor (Bifunctor (..)) +import Data.ByteString (ByteString) import qualified Data.List as List import Data.Map.Strict (Map) import qualified Data.Map.Strict as Map import Data.Maybe (fromMaybe) import qualified Data.Vector as V import Database.LSMTree.Extras.Orphans () -import Database.LSMTree.Extras.Random (frequency, +import Database.LSMTree.Extras.Random (frequency, randomByteStringR, sampleUniformWithReplacement, uniformWithoutReplacement) import Database.LSMTree.Extras.UTxO -import Database.LSMTree.Internal.BlobRef (BlobSpan (..)) import Database.LSMTree.Internal.Entry (Entry (..), NumEntries (..)) import Database.LSMTree.Internal.Lookup (bloomQueries, indexSearches, intraPageLookups, lookupsIO, prepLookups) @@ -34,6 +33,7 @@ import Database.LSMTree.Internal.Serialise import qualified Database.LSMTree.Internal.WriteBuffer as WB import qualified Database.LSMTree.Internal.WriteBufferBlobs as WBB import GHC.Exts (RealWorld) +import GHC.Stack (HasCallStack) import Prelude hiding (getContents) import System.Directory (removeDirectoryRecursive) import qualified System.FS.API as FS @@ -191,15 +191,14 @@ lookupsInBatchesEnv Config {..} = do (storedKeys, lookupKeys) <- lookupsEnv (mkStdGen 17) nentries npos nneg let hasFS = FS.ioHasFS (FS.MountPoint benchTmpDir) hasBlockIO <- FS.ioHasBlockIO hasFS (fromMaybe FS.defaultIOCtxParams ioctxps) - let wb = WB.fromMap storedKeys - fsps = RunFsPaths (FS.mkFsPath []) (RunNumber 0) wbblobs <- WBB.new hasFS (FS.mkFsPath ["0.wbblobs"]) + wb <- WB.fromMap <$> traverse (traverse (WBB.addBlob hasFS wbblobs)) storedKeys + let fsps = RunFsPaths (FS.mkFsPath []) (RunNumber 0) r <- Run.fromWriteBuffer hasFS hasBlockIO caching (RunAllocFixed 10) fsps wb wbblobs let NumEntries nentriesReal = Run.size r - assert (nentriesReal == nentries) $ pure () - let npagesReal = Run.sizeInPages r - assert (getNumPages npagesReal * 42 <= nentriesReal) $ pure () - assert (getNumPages npagesReal * 43 >= nentriesReal) $ pure () + assertEqual nentriesReal nentries $ pure () + -- 42 to 43 entries per page + assertEqual (nentriesReal `div` getNumPages (Run.sizeInPages r)) 42 $ pure () pure ( benchTmpDir , arenaManager , hasFS @@ -228,8 +227,8 @@ lookupsEnv :: -> Int -- ^ Number of stored key\/operation pairs -> Int -- ^ Number of positive lookups -> Int -- ^ Number of negative lookups - -> IO ( Map SerialisedKey (Entry SerialisedValue BlobSpan) - , V.Vector (SerialisedKey) + -> IO ( Map SerialisedKey (Entry SerialisedValue SerialisedBlob) + , V.Vector SerialisedKey ) lookupsEnv g nentries npos nneg = do let (g1, g') = R.split g @@ -242,25 +241,26 @@ lookupsEnv g nentries npos nneg = do lookups <- generate $ shuffle (negLookups ++ posLookups) let entries' = Map.mapKeys serialiseKey - $ Map.map (bimap serialiseValue id) entries + $ Map.map (bimap serialiseValue serialiseBlob) entries lookups' = V.fromList $ fmap serialiseKey lookups - assert (Map.size entries' == nentries) $ pure () - assert (length lookups' == npos + nneg) $ pure () + assertEqual (Map.size entries') (nentries) $ pure () + assertEqual (length lookups') (npos + nneg) $ pure () pure (entries', lookups') -- TODO: tweak distribution -randomEntry :: StdGen -> (Entry UTxOValue BlobSpan, StdGen) +randomEntry :: StdGen -> (Entry UTxOValue ByteString, StdGen) randomEntry g = frequency [ (20, \g' -> let (!v, !g'') = uniform g' in (Insert v, g'')) , (1, \g' -> let (!v, !g'') = uniform g' - (!b, !g''') = genBlobSpan g'' + (!b, !g''') = randomByteStringR (0, 2000) g'' -- < 2kB in (InsertWithBlob v b, g''')) , (2, \g' -> let (!v, !g'') = uniform g' in (Mupdate v, g'')) , (2, \g' -> (Delete, g')) ] g -genBlobSpan :: RandomGen g => g -> (BlobSpan, g) -genBlobSpan !g = - let (off, !g') = uniform g - (len, !g'') = uniform g' - in (BlobSpan off len, g'') +-- | Assertions on the generated environment should also be checked for release +-- builds, so don't use 'Control.Exception.assert'. +assertEqual :: (HasCallStack, Eq a, Show a) => a -> a -> b -> b +assertEqual x y + | x == y = id + | otherwise = error $ show x ++ " /= " ++ show y From a7114d1060f487ad2ac1d1605fcb88160692d86f Mon Sep 17 00:00:00 2001 From: Matthias Heinzel Date: Tue, 21 Jan 2025 17:09:13 +0100 Subject: [PATCH 4/4] Lookup micro bench: use WriteBufferBlobs from env This fixes the file paths errors for the write buffer blobs. It is also generally the right direction to go: If we ever want these tests to work with a write buffer and blobs, lookups must look at the original write buffer blobs, not an empty one. --- .../Bench/Database/LSMTree/Internal/Lookup.hs | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/bench/micro/Bench/Database/LSMTree/Internal/Lookup.hs b/bench/micro/Bench/Database/LSMTree/Internal/Lookup.hs index 3d4a01d7f..ab078b197 100644 --- a/bench/micro/Bench/Database/LSMTree/Internal/Lookup.hs +++ b/bench/micro/Bench/Database/LSMTree/Internal/Lookup.hs @@ -83,7 +83,7 @@ benchmarks = bgroup "Bench.Database.LSMTree.Internal.Lookup" [ benchLookups :: Config -> Benchmark benchLookups conf@Config{name} = - withEnv $ \ ~(_dir, arenaManager, hasFS, hasBlockIO, rs, ks) -> + withEnv $ \ ~(_dir, arenaManager, _hasFS, hasBlockIO, wbblobs, rs, ks) -> env ( pure ( V.map (\(DeRef r) -> Run.runFilter r) rs , V.map (\(DeRef r) -> Run.runIndex r) rs , V.map (\(DeRef r) -> Run.runKOpsFile r) rs @@ -126,28 +126,24 @@ benchLookups conf@Config{name} = ( do arena <- newArena arenaManager (rkixs, ioops) <- stToIO (prepLookups arena blooms indexes kopsFiles ks) ioress <- FS.submitIO hasBlockIO ioops - wbblobs <- WBB.new hasFS (FS.mkFsPath []) - pure (rkixs, ioops, ioress, arena, wbblobs) + pure (rkixs, ioops, ioress, arena) ) - (\(_, _, _, arena, wbblobs) -> do - closeArena arenaManager arena - releaseRef wbblobs) - (\ ~(rkixs, ioops, ioress, _, wbblobs_unused) -> do - !_ <- intraPageLookups resolveV WB.empty wbblobs_unused + (\(_, _, _, arena) -> closeArena arenaManager arena) + (\ ~(rkixs, ioops, ioress, _) -> do + !_ <- intraPageLookups resolveV WB.empty wbblobs rs ks rkixs ioops ioress pure ()) -- The whole shebang: lookup preparation, doing the IO, and then -- performing intra-page-lookups. Again, we evaluate the result to -- WHNF because it is the same result that intraPageLookups produces -- (see above). - , let wb_unused = WB.empty in - env (WBB.new hasFS (FS.mkFsPath [])) $ \wbblobs_unused -> - bench "Lookups in IO" $ + , bench "Lookups in IO" $ whnfAppIO (\ks' -> lookupsIO hasBlockIO arenaManager resolveV - wb_unused wbblobs_unused + WB.empty wbblobs rs blooms indexes kopsFiles ks') ks ] - --TODO: consider adding benchmarks that also use the write buffer + -- TODO: consider adding benchmarks that also use the write buffer + -- (then we can't just use 'WB.empty', but must take it from the env) where withEnv = envWithCleanup (lookupsInBatchesEnv conf) @@ -181,6 +177,7 @@ lookupsInBatchesEnv :: , ArenaManager RealWorld , FS.HasFS IO FS.HandleIO , FS.HasBlockIO IO FS.HandleIO + , Ref (WBB.WriteBufferBlobs IO FS.HandleIO) , V.Vector (Ref (Run IO FS.HandleIO)) , V.Vector SerialisedKey ) @@ -203,6 +200,7 @@ lookupsInBatchesEnv Config {..} = do , arenaManager , hasFS , hasBlockIO + , wbblobs , V.singleton r , lookupKeys ) @@ -212,13 +210,15 @@ lookupsInBatchesCleanup :: , ArenaManager RealWorld , FS.HasFS IO FS.HandleIO , FS.HasBlockIO IO FS.HandleIO + , Ref (WBB.WriteBufferBlobs IO FS.HandleIO) , V.Vector (Ref (Run IO FS.HandleIO)) , V.Vector SerialisedKey ) -> IO () -lookupsInBatchesCleanup (tmpDir, _arenaManager, _hasFS, hasBlockIO, rs, _) = do +lookupsInBatchesCleanup (tmpDir, _arenaManager, _hasFS, hasBlockIO, wbblobs, rs, _) = do FS.close hasBlockIO forM_ rs releaseRef + releaseRef wbblobs removeDirectoryRecursive tmpDir -- | Generate keys to store and keys to lookup