From 50d10e4c2b9dece40b5b6dc19529b028a060a8cd Mon Sep 17 00:00:00 2001 From: Jonas Hahnfeld Date: Fri, 8 Aug 2025 17:43:51 +0200 Subject: [PATCH 1/7] [df] Remove unused member in RNTuple snapshotting --- tree/dataframe/inc/ROOT/RDF/InterfaceUtils.hxx | 6 +++--- tree/dataframe/inc/ROOT/RDF/SnapshotHelpers.hxx | 4 +--- tree/dataframe/src/RDFSnapshotHelpers.cxx | 11 ++++------- 3 files changed, 8 insertions(+), 13 deletions(-) diff --git a/tree/dataframe/inc/ROOT/RDF/InterfaceUtils.hxx b/tree/dataframe/inc/ROOT/RDF/InterfaceUtils.hxx index a7b2fdfb81ed4..70ba4968d022b 100644 --- a/tree/dataframe/inc/ROOT/RDF/InterfaceUtils.hxx +++ b/tree/dataframe/inc/ROOT/RDF/InterfaceUtils.hxx @@ -298,9 +298,9 @@ BuildAction(const ColumnNames_t &colNames, const std::shared_ptr; - actionPtr.reset(new Action_t(Helper_t(filename, dirname, treename, colNames, outputColNames, options, inputLM, - outputLM, std::move(isDefine), colTypeIDs), - colNames, colTypeIDs, prevNode, colRegister)); + actionPtr.reset(new Action_t( + Helper_t(filename, dirname, treename, colNames, outputColNames, options, inputLM, outputLM, colTypeIDs), + colNames, colTypeIDs, prevNode, colRegister)); } else { // multi-thread snapshot to RNTuple is not yet supported // TODO(fdegeus) Add MT snapshotting diff --git a/tree/dataframe/inc/ROOT/RDF/SnapshotHelpers.hxx b/tree/dataframe/inc/ROOT/RDF/SnapshotHelpers.hxx index e7ebca8041e95..7968a57b66d4c 100644 --- a/tree/dataframe/inc/ROOT/RDF/SnapshotHelpers.hxx +++ b/tree/dataframe/inc/ROOT/RDF/SnapshotHelpers.hxx @@ -67,15 +67,13 @@ class R__CLING_PTRCHECK(off) UntypedSnapshotRNTupleHelper final : public RAction ROOT::REntry *fOutputEntry; - std::vector fIsDefine; - std::vector fInputColumnTypeIDs; // Types for the input columns public: UntypedSnapshotRNTupleHelper(std::string_view filename, std::string_view dirname, std::string_view ntuplename, const ColumnNames_t &vfnames, const ColumnNames_t &fnames, const RSnapshotOptions &options, ROOT::Detail::RDF::RLoopManager *inputLM, - ROOT::Detail::RDF::RLoopManager *outputLM, std::vector &&isDefine, + ROOT::Detail::RDF::RLoopManager *outputLM, const std::vector &colTypeIDs); UntypedSnapshotRNTupleHelper(const UntypedSnapshotRNTupleHelper &) = delete; diff --git a/tree/dataframe/src/RDFSnapshotHelpers.cxx b/tree/dataframe/src/RDFSnapshotHelpers.cxx index 94c254f4d7078..1580b3ff7b7a8 100644 --- a/tree/dataframe/src/RDFSnapshotHelpers.cxx +++ b/tree/dataframe/src/RDFSnapshotHelpers.cxx @@ -801,8 +801,7 @@ ROOT::Internal::RDF::UntypedSnapshotTTreeHelperMT::MakeNew(void *newName, std::s ROOT::Internal::RDF::UntypedSnapshotRNTupleHelper::UntypedSnapshotRNTupleHelper( std::string_view filename, std::string_view dirname, std::string_view ntuplename, const ColumnNames_t &vfnames, const ColumnNames_t &fnames, const RSnapshotOptions &options, ROOT::Detail::RDF::RLoopManager *inputLM, - ROOT::Detail::RDF::RLoopManager *outputLM, std::vector &&isDefine, - const std::vector &colTypeIDs) + ROOT::Detail::RDF::RLoopManager *outputLM, const std::vector &colTypeIDs) : fFileName(filename), fDirName(dirname), fNTupleName(ntuplename), @@ -814,7 +813,6 @@ ROOT::Internal::RDF::UntypedSnapshotRNTupleHelper::UntypedSnapshotRNTupleHelper( fOutputFieldNames(ReplaceDotWithUnderscore(fnames)), fWriter(nullptr), fOutputEntry(nullptr), - fIsDefine(std::move(isDefine)), fInputColumnTypeIDs(colTypeIDs) { EnsureValidSnapshotRNTupleOutput(fOptions, fNTupleName, fFileName); @@ -899,8 +897,7 @@ ROOT::Internal::RDF::UntypedSnapshotRNTupleHelper ROOT::Internal::RDF::UntypedSnapshotRNTupleHelper::MakeNew(void *newName) { const std::string finalName = *reinterpret_cast(newName); - return UntypedSnapshotRNTupleHelper{finalName, fDirName, fNTupleName, - fInputFieldNames, fOutputFieldNames, fOptions, - fInputLoopManager, fOutputLoopManager, std::vector(fIsDefine), - fInputColumnTypeIDs}; + return UntypedSnapshotRNTupleHelper{finalName, fDirName, fNTupleName, + fInputFieldNames, fOutputFieldNames, fOptions, + fInputLoopManager, fOutputLoopManager, fInputColumnTypeIDs}; } From 7ad5d753f69aff7053f01d946d5de9641eb195b2 Mon Sep 17 00:00:00 2001 From: Jonas Hahnfeld Date: Mon, 11 Aug 2025 11:09:16 +0200 Subject: [PATCH 2/7] [df] Simplify member init in RNTuple snapshotting --- tree/dataframe/inc/ROOT/RDF/SnapshotHelpers.hxx | 2 +- tree/dataframe/src/RDFSnapshotHelpers.cxx | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/tree/dataframe/inc/ROOT/RDF/SnapshotHelpers.hxx b/tree/dataframe/inc/ROOT/RDF/SnapshotHelpers.hxx index 7968a57b66d4c..37df5cbb76666 100644 --- a/tree/dataframe/inc/ROOT/RDF/SnapshotHelpers.hxx +++ b/tree/dataframe/inc/ROOT/RDF/SnapshotHelpers.hxx @@ -65,7 +65,7 @@ class R__CLING_PTRCHECK(off) UntypedSnapshotRNTupleHelper final : public RAction ColumnNames_t fOutputFieldNames; std::unique_ptr fWriter; - ROOT::REntry *fOutputEntry; + ROOT::REntry *fOutputEntry = nullptr; std::vector fInputColumnTypeIDs; // Types for the input columns diff --git a/tree/dataframe/src/RDFSnapshotHelpers.cxx b/tree/dataframe/src/RDFSnapshotHelpers.cxx index 1580b3ff7b7a8..283c17912d9d0 100644 --- a/tree/dataframe/src/RDFSnapshotHelpers.cxx +++ b/tree/dataframe/src/RDFSnapshotHelpers.cxx @@ -805,14 +805,11 @@ ROOT::Internal::RDF::UntypedSnapshotRNTupleHelper::UntypedSnapshotRNTupleHelper( : fFileName(filename), fDirName(dirname), fNTupleName(ntuplename), - fOutputFile(nullptr), fOptions(options), fInputLoopManager(inputLM), fOutputLoopManager(outputLM), fInputFieldNames(vfnames), fOutputFieldNames(ReplaceDotWithUnderscore(fnames)), - fWriter(nullptr), - fOutputEntry(nullptr), fInputColumnTypeIDs(colTypeIDs) { EnsureValidSnapshotRNTupleOutput(fOptions, fNTupleName, fFileName); From 8aca0a6b21eb787e720a7589122c3767e6faa6f4 Mon Sep 17 00:00:00 2001 From: Jonas Hahnfeld Date: Fri, 8 Aug 2025 17:30:47 +0200 Subject: [PATCH 3/7] [df] Reorder helper methods for RNTuple snapshotting --- .../dataframe/inc/ROOT/RDF/SnapshotHelpers.hxx | 4 ++-- tree/dataframe/src/RDFSnapshotHelpers.cxx | 18 +++++++++--------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/tree/dataframe/inc/ROOT/RDF/SnapshotHelpers.hxx b/tree/dataframe/inc/ROOT/RDF/SnapshotHelpers.hxx index 37df5cbb76666..3f313be2cd1f3 100644 --- a/tree/dataframe/inc/ROOT/RDF/SnapshotHelpers.hxx +++ b/tree/dataframe/inc/ROOT/RDF/SnapshotHelpers.hxx @@ -82,12 +82,12 @@ public: UntypedSnapshotRNTupleHelper &operator=(UntypedSnapshotRNTupleHelper &&) noexcept; ~UntypedSnapshotRNTupleHelper() final; + void Initialize(); + void InitTask(TTreeReader *, unsigned int /* slot */) {} void Exec(unsigned int /* slot */, const std::vector &values); - void Initialize(); - void Finalize(); std::string GetActionName() { return "Snapshot"; } diff --git a/tree/dataframe/src/RDFSnapshotHelpers.cxx b/tree/dataframe/src/RDFSnapshotHelpers.cxx index 283c17912d9d0..e6408f59e9009 100644 --- a/tree/dataframe/src/RDFSnapshotHelpers.cxx +++ b/tree/dataframe/src/RDFSnapshotHelpers.cxx @@ -827,15 +827,6 @@ ROOT::Internal::RDF::UntypedSnapshotRNTupleHelper::~UntypedSnapshotRNTupleHelper Warning("Snapshot", "A lazy Snapshot action was booked but never triggered."); } -void ROOT::Internal::RDF::UntypedSnapshotRNTupleHelper::Exec(unsigned int /* slot */, const std::vector &values) -{ - assert(values.size() == fOutputFieldNames.size()); - for (decltype(values.size()) i = 0; i < values.size(); i++) { - fOutputEntry->BindRawPtr(fOutputFieldNames[i], values[i]); - } - fWriter->Fill(); -} - void ROOT::Internal::RDF::UntypedSnapshotRNTupleHelper::Initialize() { auto model = ROOT::RNTupleModel::Create(); @@ -872,6 +863,15 @@ void ROOT::Internal::RDF::UntypedSnapshotRNTupleHelper::Initialize() fWriter = ROOT::RNTupleWriter::Append(std::move(model), fNTupleName, *outputDir, writeOptions); } +void ROOT::Internal::RDF::UntypedSnapshotRNTupleHelper::Exec(unsigned int /* slot */, const std::vector &values) +{ + assert(values.size() == fOutputFieldNames.size()); + for (decltype(values.size()) i = 0; i < values.size(); i++) { + fOutputEntry->BindRawPtr(fOutputFieldNames[i], values[i]); + } + fWriter->Fill(); +} + void ROOT::Internal::RDF::UntypedSnapshotRNTupleHelper::Finalize() { fWriter.reset(); From 924d5a230f9347a899514570d9ef52b8f44e2a28 Mon Sep 17 00:00:00 2001 From: Jonas Hahnfeld Date: Fri, 8 Aug 2025 17:06:42 +0200 Subject: [PATCH 4/7] [df] Use bare REntry for RNTuple snapshotting ... instead of the default entry. Then we also only need a bare model. --- tree/dataframe/inc/ROOT/RDF/SnapshotHelpers.hxx | 2 +- tree/dataframe/src/RDFSnapshotHelpers.cxx | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/tree/dataframe/inc/ROOT/RDF/SnapshotHelpers.hxx b/tree/dataframe/inc/ROOT/RDF/SnapshotHelpers.hxx index 3f313be2cd1f3..b066126f8c97c 100644 --- a/tree/dataframe/inc/ROOT/RDF/SnapshotHelpers.hxx +++ b/tree/dataframe/inc/ROOT/RDF/SnapshotHelpers.hxx @@ -65,7 +65,7 @@ class R__CLING_PTRCHECK(off) UntypedSnapshotRNTupleHelper final : public RAction ColumnNames_t fOutputFieldNames; std::unique_ptr fWriter; - ROOT::REntry *fOutputEntry = nullptr; + std::unique_ptr fOutputEntry; std::vector fInputColumnTypeIDs; // Types for the input columns diff --git a/tree/dataframe/src/RDFSnapshotHelpers.cxx b/tree/dataframe/src/RDFSnapshotHelpers.cxx index e6408f59e9009..e0ed4e02266e5 100644 --- a/tree/dataframe/src/RDFSnapshotHelpers.cxx +++ b/tree/dataframe/src/RDFSnapshotHelpers.cxx @@ -829,7 +829,7 @@ ROOT::Internal::RDF::UntypedSnapshotRNTupleHelper::~UntypedSnapshotRNTupleHelper void ROOT::Internal::RDF::UntypedSnapshotRNTupleHelper::Initialize() { - auto model = ROOT::RNTupleModel::Create(); + auto model = ROOT::RNTupleModel::CreateBare(); auto nFields = fOutputFieldNames.size(); for (decltype(nFields) i = 0; i < nFields; i++) { // Need to retrieve the type of every field to create as a string @@ -841,7 +841,8 @@ void ROOT::Internal::RDF::UntypedSnapshotRNTupleHelper::Initialize() : ROOT::Internal::RDF::TypeID2TypeName(*fInputColumnTypeIDs[i]); model->AddField(ROOT::RFieldBase::Create(fOutputFieldNames[i], typeName).Unwrap()); } - fOutputEntry = &model->GetDefaultEntry(); + model->Freeze(); + fOutputEntry = model->CreateBareEntry(); ROOT::RNTupleWriteOptions writeOptions; writeOptions.SetCompression(fOptions.fCompressionAlgorithm, fOptions.fCompressionLevel); @@ -869,7 +870,7 @@ void ROOT::Internal::RDF::UntypedSnapshotRNTupleHelper::Exec(unsigned int /* slo for (decltype(values.size()) i = 0; i < values.size(); i++) { fOutputEntry->BindRawPtr(fOutputFieldNames[i], values[i]); } - fWriter->Fill(); + fWriter->Fill(*fOutputEntry); } void ROOT::Internal::RDF::UntypedSnapshotRNTupleHelper::Finalize() From 30ce048dae642ced0e456f17317f56254015dbdf Mon Sep 17 00:00:00 2001 From: Jonas Hahnfeld Date: Mon, 11 Aug 2025 10:26:12 +0200 Subject: [PATCH 5/7] [df] Use field tokens for RNTuple snapshotting This is less expensive than string comparisons of field names during every call to Exec(). --- tree/dataframe/inc/ROOT/RDF/SnapshotHelpers.hxx | 2 ++ tree/dataframe/src/RDFSnapshotHelpers.cxx | 7 +++++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/tree/dataframe/inc/ROOT/RDF/SnapshotHelpers.hxx b/tree/dataframe/inc/ROOT/RDF/SnapshotHelpers.hxx index b066126f8c97c..cf7ba36f8ea55 100644 --- a/tree/dataframe/inc/ROOT/RDF/SnapshotHelpers.hxx +++ b/tree/dataframe/inc/ROOT/RDF/SnapshotHelpers.hxx @@ -32,6 +32,7 @@ class TFile; namespace ROOT { class RNTupleWriter; class REntry; +class RFieldToken; class TBufferMerger; class TBufferMergerFile; } // namespace ROOT @@ -64,6 +65,7 @@ class R__CLING_PTRCHECK(off) UntypedSnapshotRNTupleHelper final : public RAction ColumnNames_t fInputFieldNames; // This contains the resolved aliases ColumnNames_t fOutputFieldNames; std::unique_ptr fWriter; + std::vector fFieldTokens; std::unique_ptr fOutputEntry; diff --git a/tree/dataframe/src/RDFSnapshotHelpers.cxx b/tree/dataframe/src/RDFSnapshotHelpers.cxx index e0ed4e02266e5..8711467cba2f6 100644 --- a/tree/dataframe/src/RDFSnapshotHelpers.cxx +++ b/tree/dataframe/src/RDFSnapshotHelpers.cxx @@ -20,6 +20,7 @@ #include #include +#include #include #include #include @@ -831,6 +832,7 @@ void ROOT::Internal::RDF::UntypedSnapshotRNTupleHelper::Initialize() { auto model = ROOT::RNTupleModel::CreateBare(); auto nFields = fOutputFieldNames.size(); + fFieldTokens.resize(nFields); for (decltype(nFields) i = 0; i < nFields; i++) { // Need to retrieve the type of every field to create as a string // If the input type for a field does not have RTTI, internally we store it as the tag UseNativeDataType. When @@ -840,6 +842,7 @@ void ROOT::Internal::RDF::UntypedSnapshotRNTupleHelper::Initialize() fInputFieldNames[i], fOptions.fVector2RVec) : ROOT::Internal::RDF::TypeID2TypeName(*fInputColumnTypeIDs[i]); model->AddField(ROOT::RFieldBase::Create(fOutputFieldNames[i], typeName).Unwrap()); + fFieldTokens[i] = model->GetToken(fOutputFieldNames[i]); } model->Freeze(); fOutputEntry = model->CreateBareEntry(); @@ -866,9 +869,9 @@ void ROOT::Internal::RDF::UntypedSnapshotRNTupleHelper::Initialize() void ROOT::Internal::RDF::UntypedSnapshotRNTupleHelper::Exec(unsigned int /* slot */, const std::vector &values) { - assert(values.size() == fOutputFieldNames.size()); + assert(values.size() == fFieldTokens.size()); for (decltype(values.size()) i = 0; i < values.size(); i++) { - fOutputEntry->BindRawPtr(fOutputFieldNames[i], values[i]); + fOutputEntry->BindRawPtr(fFieldTokens[i], values[i]); } fWriter->Fill(*fOutputEntry); } From 09bff549a728642ba40bfd50761e94232ec3e9de Mon Sep 17 00:00:00 2001 From: Jonas Hahnfeld Date: Mon, 11 Aug 2025 11:03:52 +0200 Subject: [PATCH 6/7] [df] Support MT snapshotting to RNTuple Switch the existing code to use the RNTupleParallelWriter with one RNTupleFillContext per slot. For sequential snapshotting, this should be (almost) as efficient as the RNTupleWriter (one additional cloned RNTupleModel for the only fill context), but save quite a bit of code duplication and in testing effort. --- .../dataframe/inc/ROOT/RDF/InterfaceUtils.hxx | 18 +++---- .../inc/ROOT/RDF/SnapshotHelpers.hxx | 21 +++++--- tree/dataframe/src/RDFSnapshotHelpers.cxx | 51 ++++++++++++++----- .../test/dataframe_snapshot_ntuple.cxx | 22 ++++---- 4 files changed, 71 insertions(+), 41 deletions(-) diff --git a/tree/dataframe/inc/ROOT/RDF/InterfaceUtils.hxx b/tree/dataframe/inc/ROOT/RDF/InterfaceUtils.hxx index 70ba4968d022b..cab252e336494 100644 --- a/tree/dataframe/inc/ROOT/RDF/InterfaceUtils.hxx +++ b/tree/dataframe/inc/ROOT/RDF/InterfaceUtils.hxx @@ -293,19 +293,13 @@ BuildAction(const ColumnNames_t &colNames, const std::shared_ptr actionPtr; if (snapHelperArgs->fToNTuple) { - if (!ROOT::IsImplicitMTEnabled()) { - // single-thread snapshot - using Helper_t = UntypedSnapshotRNTupleHelper; - using Action_t = RActionSnapshot; + // We use the same helper for single- and multi-thread snapshot. + using Helper_t = UntypedSnapshotRNTupleHelper; + using Action_t = RActionSnapshot; - actionPtr.reset(new Action_t( - Helper_t(filename, dirname, treename, colNames, outputColNames, options, inputLM, outputLM, colTypeIDs), - colNames, colTypeIDs, prevNode, colRegister)); - } else { - // multi-thread snapshot to RNTuple is not yet supported - // TODO(fdegeus) Add MT snapshotting - throw std::runtime_error("Snapshot: Snapshotting to RNTuple with IMT enabled is not supported yet."); - } + actionPtr.reset(new Action_t(Helper_t(nSlots, filename, dirname, treename, colNames, outputColNames, options, + inputLM, outputLM, colTypeIDs), + colNames, colTypeIDs, prevNode, colRegister)); } else { if (!ROOT::IsImplicitMTEnabled()) { // single-thread snapshot diff --git a/tree/dataframe/inc/ROOT/RDF/SnapshotHelpers.hxx b/tree/dataframe/inc/ROOT/RDF/SnapshotHelpers.hxx index cf7ba36f8ea55..ef10695b6c4ca 100644 --- a/tree/dataframe/inc/ROOT/RDF/SnapshotHelpers.hxx +++ b/tree/dataframe/inc/ROOT/RDF/SnapshotHelpers.hxx @@ -30,7 +30,10 @@ class TBranch; class TFile; namespace ROOT { -class RNTupleWriter; +namespace Experimental { +class RNTupleFillContext; +class RNTupleParallelWriter; +} // namespace Experimental class REntry; class RFieldToken; class TBufferMerger; @@ -64,16 +67,18 @@ class R__CLING_PTRCHECK(off) UntypedSnapshotRNTupleHelper final : public RAction ROOT::Detail::RDF::RLoopManager *fOutputLoopManager; ColumnNames_t fInputFieldNames; // This contains the resolved aliases ColumnNames_t fOutputFieldNames; - std::unique_ptr fWriter; + std::unique_ptr fWriter; std::vector fFieldTokens; - std::unique_ptr fOutputEntry; + unsigned int fNSlots; + std::vector> fFillContexts; + std::vector> fEntries; std::vector fInputColumnTypeIDs; // Types for the input columns public: - UntypedSnapshotRNTupleHelper(std::string_view filename, std::string_view dirname, std::string_view ntuplename, - const ColumnNames_t &vfnames, const ColumnNames_t &fnames, + UntypedSnapshotRNTupleHelper(unsigned int nSlots, std::string_view filename, std::string_view dirname, + std::string_view ntuplename, const ColumnNames_t &vfnames, const ColumnNames_t &fnames, const RSnapshotOptions &options, ROOT::Detail::RDF::RLoopManager *inputLM, ROOT::Detail::RDF::RLoopManager *outputLM, const std::vector &colTypeIDs); @@ -86,9 +91,11 @@ public: void Initialize(); - void InitTask(TTreeReader *, unsigned int /* slot */) {} + void Exec(unsigned int slot, const std::vector &values); + + void InitTask(TTreeReader *, unsigned int slot); - void Exec(unsigned int /* slot */, const std::vector &values); + void FinalizeTask(unsigned int slot); void Finalize(); diff --git a/tree/dataframe/src/RDFSnapshotHelpers.cxx b/tree/dataframe/src/RDFSnapshotHelpers.cxx index 8711467cba2f6..e55c253eadf49 100644 --- a/tree/dataframe/src/RDFSnapshotHelpers.cxx +++ b/tree/dataframe/src/RDFSnapshotHelpers.cxx @@ -23,7 +23,8 @@ #include #include #include -#include +#include +#include #include #include @@ -800,9 +801,10 @@ ROOT::Internal::RDF::UntypedSnapshotTTreeHelperMT::MakeNew(void *newName, std::s } ROOT::Internal::RDF::UntypedSnapshotRNTupleHelper::UntypedSnapshotRNTupleHelper( - std::string_view filename, std::string_view dirname, std::string_view ntuplename, const ColumnNames_t &vfnames, - const ColumnNames_t &fnames, const RSnapshotOptions &options, ROOT::Detail::RDF::RLoopManager *inputLM, - ROOT::Detail::RDF::RLoopManager *outputLM, const std::vector &colTypeIDs) + unsigned int nSlots, std::string_view filename, std::string_view dirname, std::string_view ntuplename, + const ColumnNames_t &vfnames, const ColumnNames_t &fnames, const RSnapshotOptions &options, + ROOT::Detail::RDF::RLoopManager *inputLM, ROOT::Detail::RDF::RLoopManager *outputLM, + const std::vector &colTypeIDs) : fFileName(filename), fDirName(dirname), fNTupleName(ntuplename), @@ -811,6 +813,9 @@ ROOT::Internal::RDF::UntypedSnapshotRNTupleHelper::UntypedSnapshotRNTupleHelper( fOutputLoopManager(outputLM), fInputFieldNames(vfnames), fOutputFieldNames(ReplaceDotWithUnderscore(fnames)), + fNSlots(nSlots), + fFillContexts(nSlots), + fEntries(nSlots), fInputColumnTypeIDs(colTypeIDs) { EnsureValidSnapshotRNTupleOutput(fOptions, fNTupleName, fFileName); @@ -845,7 +850,6 @@ void ROOT::Internal::RDF::UntypedSnapshotRNTupleHelper::Initialize() fFieldTokens[i] = model->GetToken(fOutputFieldNames[i]); } model->Freeze(); - fOutputEntry = model->CreateBareEntry(); ROOT::RNTupleWriteOptions writeOptions; writeOptions.SetCompression(fOptions.fCompressionAlgorithm, fOptions.fCompressionLevel); @@ -864,20 +868,43 @@ void ROOT::Internal::RDF::UntypedSnapshotRNTupleHelper::Initialize() outputDir = fOutputFile->mkdir(fDirName.c_str()); } - fWriter = ROOT::RNTupleWriter::Append(std::move(model), fNTupleName, *outputDir, writeOptions); + // The RNTupleParallelWriter has exclusive access to the underlying TFile, no further synchronization is needed for + // calls to Fill() (in Exec) and FlushCluster() (in FinalizeTask). + fWriter = ROOT::Experimental::RNTupleParallelWriter::Append(std::move(model), fNTupleName, *outputDir, writeOptions); +} + +void ROOT::Internal::RDF::UntypedSnapshotRNTupleHelper::InitTask(TTreeReader *, unsigned int slot) +{ + if (!fFillContexts[slot]) { + fFillContexts[slot] = fWriter->CreateFillContext(); + fEntries[slot] = fFillContexts[slot]->GetModel().CreateBareEntry(); + } } -void ROOT::Internal::RDF::UntypedSnapshotRNTupleHelper::Exec(unsigned int /* slot */, const std::vector &values) +void ROOT::Internal::RDF::UntypedSnapshotRNTupleHelper::Exec(unsigned int slot, const std::vector &values) { + auto &fillContext = fFillContexts[slot]; + auto &outputEntry = fEntries[slot]; assert(values.size() == fFieldTokens.size()); for (decltype(values.size()) i = 0; i < values.size(); i++) { - fOutputEntry->BindRawPtr(fFieldTokens[i], values[i]); + outputEntry->BindRawPtr(fFieldTokens[i], values[i]); } - fWriter->Fill(*fOutputEntry); + fillContext->Fill(*outputEntry); +} + +void ROOT::Internal::RDF::UntypedSnapshotRNTupleHelper::FinalizeTask(unsigned int slot) +{ + // In principle we would not need to flush a cluster here, but we want to benefit from parallelism for compression. + // NB: RNTupleFillContext::FlushCluster() is a nop if there is no new entry since the last flush. + fFillContexts[slot]->FlushCluster(); } void ROOT::Internal::RDF::UntypedSnapshotRNTupleHelper::Finalize() { + // First clear and destroy all entries, which were created from the RNTupleFillContexts. + fEntries.clear(); + fFillContexts.clear(); + // Then destroy the RNTupleParallelWriter and write the metadata. fWriter.reset(); // We can now set the data source of the loop manager for the RDataFrame that is returned by the Snapshot call. fOutputLoopManager->SetDataSource(std::make_unique(fDirName + "/" + fNTupleName, fFileName)); @@ -898,7 +925,7 @@ ROOT::Internal::RDF::UntypedSnapshotRNTupleHelper ROOT::Internal::RDF::UntypedSnapshotRNTupleHelper::MakeNew(void *newName) { const std::string finalName = *reinterpret_cast(newName); - return UntypedSnapshotRNTupleHelper{finalName, fDirName, fNTupleName, - fInputFieldNames, fOutputFieldNames, fOptions, - fInputLoopManager, fOutputLoopManager, fInputColumnTypeIDs}; + return UntypedSnapshotRNTupleHelper{ + fNSlots, finalName, fDirName, fNTupleName, fInputFieldNames, + fOutputFieldNames, fOptions, fInputLoopManager, fOutputLoopManager, fInputColumnTypeIDs}; } diff --git a/tree/dataframe/test/dataframe_snapshot_ntuple.cxx b/tree/dataframe/test/dataframe_snapshot_ntuple.cxx index 5a1c268f848e2..1ffaa66be0330 100644 --- a/tree/dataframe/test/dataframe_snapshot_ntuple.cxx +++ b/tree/dataframe/test/dataframe_snapshot_ntuple.cxx @@ -614,23 +614,25 @@ struct TIMTEnabler { ~TIMTEnabler() { ROOT::DisableImplicitMT(); } }; -TEST(RDFSnapshotRNTuple, ThrowIfMT) +TEST(RDFSnapshotRNTuple, WithMT) { TIMTEnabler _(4); - FileRAII fileGuard{"RDFSnapshotRNTuple_throw_if_mt.root"}; + FileRAII fileGuard{"RDFSnapshotRNTuple_mt.root"}; - auto df = ROOT::RDataFrame(25ull).Define("x", [] { return 10; }); + auto df = ROOT::RDataFrame(25ull).Define("x", [](ULong64_t e) { return e; }, {"rdfentry_"}); RSnapshotOptions opts; opts.fOutputFormat = ROOT::RDF::ESnapshotOutputFormat::kRNTuple; - try { - auto sdf = df.Snapshot("ntuple", fileGuard.GetPath(), {"x"}, opts); - *sdf; - FAIL() << "MT snapshotting to RNTuple is not supported yet"; - } catch (const std::runtime_error &err) { - EXPECT_STREQ(err.what(), "Snapshot: Snapshotting to RNTuple with IMT enabled is not supported yet."); - } + auto sdf = df.Snapshot("ntuple", fileGuard.GetPath(), {"x"}, opts); + *sdf; + + auto sum = sdf->Sum("x"); + EXPECT_EQ(300, sum.GetValue()); + + auto reader = RNTupleReader::Open("ntuple", fileGuard.GetPath()); + EXPECT_EQ(25, reader->GetNEntries()); + // There should be more than one cluster, but this is not guaranteed because of scheduling... } #endif // R__USE_IMT From 6fd522fac2dc1de4797b4f38b0ceed47647287a1 Mon Sep 17 00:00:00 2001 From: Jonas Hahnfeld Date: Mon, 11 Aug 2025 14:29:39 +0200 Subject: [PATCH 7/7] [df] Fix warning about untriggered lazy RNTuple snapshot Use the same conditions as TTree, looking at fOutputFile instead of the data source. --- tree/dataframe/src/RDFSnapshotHelpers.cxx | 2 +- tree/dataframe/test/dataframe_snapshot_ntuple.cxx | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/tree/dataframe/src/RDFSnapshotHelpers.cxx b/tree/dataframe/src/RDFSnapshotHelpers.cxx index e55c253eadf49..98c9cafaa9b3f 100644 --- a/tree/dataframe/src/RDFSnapshotHelpers.cxx +++ b/tree/dataframe/src/RDFSnapshotHelpers.cxx @@ -829,7 +829,7 @@ ROOT::Internal::RDF::UntypedSnapshotRNTupleHelper &ROOT::Internal::RDF::UntypedS ROOT::Internal::RDF::UntypedSnapshotRNTupleHelper::~UntypedSnapshotRNTupleHelper() { - if (!fNTupleName.empty() && !fOutputLoopManager->GetDataSource() && fOptions.fLazy) + if (!fNTupleName.empty() /* not moved from */ && !fOutputFile /* did not run */ && fOptions.fLazy) Warning("Snapshot", "A lazy Snapshot action was booked but never triggered."); } diff --git a/tree/dataframe/test/dataframe_snapshot_ntuple.cxx b/tree/dataframe/test/dataframe_snapshot_ntuple.cxx index 1ffaa66be0330..64323c0650f4f 100644 --- a/tree/dataframe/test/dataframe_snapshot_ntuple.cxx +++ b/tree/dataframe/test/dataframe_snapshot_ntuple.cxx @@ -72,6 +72,18 @@ TEST(RDFSnapshotRNTuple, FromScratch) } } +TEST(RDFSnapshotRNTuple, LazyTriggered) +{ + FileRAII fileGuard{"RDFSnapshotRNTuple_lazy.root"}; + auto d = ROOT::RDataFrame(1); + ROOT::RDF::RSnapshotOptions opts; + opts.fOutputFormat = ROOT::RDF::ESnapshotOutputFormat::kRNTuple; + opts.fLazy = true; + auto r = d.Snapshot("t", fileGuard.GetPath(), {"rdfentry_"}, opts); + *r; + r = {}; +} + void BookLazySnapshot(std::string_view filename) { auto d = ROOT::RDataFrame(1);