@@ -247,7 +247,7 @@ class CudaKernelGenerator : private OptOutConstDispatch {
     }

     // Kernels generating random numbers take extra (seed, offset) arguments
-    if (kernel_summary.is_stochastic) {
+    if (kernel_summary.max_rng_offsets >= 0) {
       code_ << ", at::PhiloxCudaState philox_args";
     }

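Note: the gate changes from a boolean (`is_stochastic`) to `max_rng_offsets >= 0`, but the effect on the generated kernel is the same: when any RNG op is present, the parameter list gains a trailing `at::PhiloxCudaState philox_args`. A hedged sketch of what an emitted declaration might look like (kernel name and tensor parameters are illustrative placeholders, not taken from this diff):

    // Illustrative only: actual name and parameters depend on the fusion.
    __global__ void kernel1(
        Tensor<float, 2> T0,
        Tensor<float, 2> T1,
        at::PhiloxCudaState philox_args) {
      // ...
    }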
@@ -259,14 +259,14 @@ class CudaKernelGenerator : private OptOutConstDispatch {
     const auto& kernel_summary = kernel_->summary();

     // Random number generator (optional)
-    if (kernel_summary.is_stochastic) {
-      indent()
-          << "const auto idx = ((((blockIdx.z * gridDim.y + blockIdx.y) * gridDim.x + blockIdx.x) * blockDim.z + threadIdx.z) * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x;";
+    if (kernel_summary.max_rng_offsets >= 0) {
       indent() << "auto offset = philox_args.captured_ ?\n";
       indent()
           << "  static_cast<uint64_t>(*(philox_args.offset_.ptr) + philox_args.offset_intragraph_) :\n";
       indent() << "  philox_args.offset_.val;\n";
-      indent() << "Philox rnd(philox_args.seed_, idx, offset);\n";
+      indent() << "uint4 rng_result;\n";
+      indent() << "nvfuser_index_t rng_subseq = -1;\n";
+      indent() << "nvfuser_index_t rng_offset = -1;\n";
     }

     // Do we have any dynamic shared memory buffers?
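The preamble no longer constructs one Philox engine per thread keyed on the linear thread index; it only resolves the base offset and declares lazily-filled cache state. Reassembling the string literals above, the emitted CUDA preamble is roughly:

    auto offset = philox_args.captured_ ?
      static_cast<uint64_t>(*(philox_args.offset_.ptr) + philox_args.offset_intragraph_) :
      philox_args.offset_.val;
    uint4 rng_result;                 // last 128-bit Philox draw
    nvfuser_index_t rng_subseq = -1;  // subsequence of that draw (-1 = none yet)
    nvfuser_index_t rng_offset = -1;  // per-op RNG offset of that draw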
@@ -695,8 +695,9 @@ class CudaKernelGenerator : private OptOutConstDispatch {
       }
     }

+    const auto op_type = uop->getUnaryOpType();
+
     if (uop->out()->isA<NamedScalar>()) {
-      const auto op_type = uop->getUnaryOpType();
       if (auto op = inline_op_str(op_type)) {
         indent() << gen(uop->out()) << " = " << *op << genInline(uop->in())
                  << ";\n";
@@ -705,15 +706,36 @@ class CudaKernelGenerator : private OptOutConstDispatch {
     }

     if (!print_inline_) {
+      if (op_type == UnaryOpType::RandLike) {
+        auto out_tv = uop->out()->as<kir::TensorIndex>()->view();
+        auto index = genTensorIndex(uop->out()->as<kir::TensorIndex>());
+        int multiple = out_tv->getDataType() == DataType::Double ? 2 : 4;
+        indent() << "nvfuser_index_t subseq" << uop->name() << " = (" << index
+                 << ") / " << multiple << ";\n";
+        indent() << "nvfuser_index_t component" << uop->name() << " = ("
+                 << index << ") % " << multiple << ";\n";
+        indent() << "nvfuser_index_t offset" << uop->name() << " = "
+                 << uop->getRNGOffset() << ";\n";
+        indent() << "if (rng_subseq != subseq" << uop->name()
+                 << " || rng_offset != offset" << uop->name() << ") {\n";
+        indent() << "  rng_result = philox(philox_args.seed_, subseq"
+                 << uop->name() << ", offset / 4 + offset" << uop->name()
+                 << ");\n";
+        indent() << "  rng_subseq = subseq" << uop->name() << ";\n";
+        indent() << "  rng_offset = offset" << uop->name() << ";\n";
+        indent() << "}\n";
+      }
+
       indent() << gen(uop->out());
       if (!uop->out()->isScalar() && !uop->in()->isScalar()) {
         code_ << "\n";
         indent() << kTab;
       }
       code_ << " = ";
+    } else {
+      TORCH_INTERNAL_ASSERT(op_type != UnaryOpType::RandLike);
     }

-    const auto op_type = uop->getUnaryOpType();
     if (auto op = inline_op_str(op_type)) {
       if (alsoBooleanOperator(op_type) &&
           uop->out()->dtype() == DataType::Bool) {
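For each RandLike op, the generator now decomposes the output's linear index into a Philox subsequence and a component: one Philox call yields 128 bits, i.e. 4 floats or 2 doubles, which is what `multiple` encodes. It re-draws only when the (subsequence, offset) pair changes, so consecutive elements reuse the cached `rng_result`. Reassembling the emitted strings for a hypothetical op named `7` with float output and index expression `i7` (op name, index, and the `0` offset value are illustrative; `offset` is the preamble variable shown earlier):

    nvfuser_index_t subseq7 = (i7) / 4;     // which 128-bit Philox draw
    nvfuser_index_t component7 = (i7) % 4;  // which 32-bit word of that draw
    nvfuser_index_t offset7 = 0;            // this op's offset, from uop->getRNGOffset()
    if (rng_subseq != subseq7 || rng_offset != offset7) {
      rng_result = philox(philox_args.seed_, subseq7, offset / 4 + offset7);
      rng_subseq = subseq7;
      rng_offset = offset7;
    }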
@@ -742,7 +764,7 @@ class CudaKernelGenerator : private OptOutConstDispatch {

     code_ << "(";
     if (op_type == UnaryOpType::RandLike) {
-      code_ << "rnd";
+      code_ << "rng_result, component" << uop->name();
     } else {
       code_ << gen(uop->in());
     }
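The call site passes the cached draw plus the component instead of a per-thread engine, so each element's value is a pure function of (seed, subsequence, offset) and is independent of the thread/block mapping. Below is a standalone, minimal sketch of that counter-based scheme, using cuRAND's Philox4_32-10 in place of nvfuser's inline philox() device helper (all names here are mine, not from this PR; the PR additionally caches the draw across consecutive elements, which this sketch omits):

    #include <cstdio>
    #include <cuda_runtime.h>
    #include <curand_kernel.h>

    __global__ void rand_like_sketch(float* out, int n, unsigned long long seed,
                                     unsigned long long offset) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i >= n) {
        return;
      }
      int subseq = i / 4;     // one 128-bit Philox draw yields 4 floats
      int component = i % 4;  // which of the 4 floats this element consumes
      curandStatePhilox4_32_10_t state;
      curand_init(seed, subseq, offset, &state);
      float4 r = curand_uniform4(&state);
      out[i] = component == 0 ? r.x : component == 1 ? r.y
             : component == 2 ? r.z : r.w;
    }

    int main() {
      const int n = 8;
      float* out;
      cudaMallocManaged(&out, n * sizeof(float));
      rand_like_sketch<<<2, 4>>>(out, n, /*seed=*/0, /*offset=*/0);
      cudaDeviceSynchronize();
      for (int i = 0; i < n; ++i) {
        printf("%f\n", out[i]);  // same values for any launch configuration
      }
      cudaFree(out);
      return 0;
    }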