From 6400b07110221ceb7cbb209a96a48e76853bf384 Mon Sep 17 00:00:00 2001 From: Simone Carlo Surace Date: Fri, 26 Mar 2021 15:34:31 +0100 Subject: [PATCH 01/10] add development dependencies --- Project.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Project.toml b/Project.toml index d5f76e6..acdcf44 100644 --- a/Project.toml +++ b/Project.toml @@ -7,6 +7,8 @@ version = "0.2.3" BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" +GPUCompiler = "61eb1bfa-7361-4325-ad38-22787b887f55" +LLVM = "929cbde3-209d-540e-8aea-75f648917ca0" [compat] BenchmarkTools = "0.6" From 68fa42018e4f3f8c5bb193f42c453138dc284429 Mon Sep 17 00:00:00 2001 From: Simone Carlo Surace Date: Fri, 26 Mar 2021 15:55:34 +0100 Subject: [PATCH 02/10] replace GPUArrays.gpu_rand by rand(Float32) , remove rng arguments --- src/kernels.jl | 10 +++++----- src/rand_binomial.jl | 28 ++++++++++++++-------------- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/src/kernels.jl b/src/kernels.jl index eb15901..114b0ee 100644 --- a/src/kernels.jl +++ b/src/kernels.jl @@ -32,7 +32,7 @@ end # BTRS algorithm, adapted from the tensorflow library (https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/kernels/random_binomial_op.cc) -function kernel_BTRS!(A, count, prob, randstates) +function kernel_BTRS!(A, count, prob) i = (blockIdx().x - 1) * blockDim().x + threadIdx().x indices = CartesianIndices(A) @@ -59,7 +59,7 @@ function kernel_BTRS!(A, count, prob, randstates) k = 0 ctr = 1 while ctr <= n - GPUArrays.gpu_rand(Float32, CUDA.CuKernelContext(), randstates) < p && (k += 1) + rand(Float32) < p && (k += 1) ctr += 1 end A[i] = k @@ -72,7 +72,7 @@ function kernel_BTRS!(A, count, prob, randstates) geom_sum = 0f0 num_geom = 0 while true - geom = ceil(CUDA.log(GPUArrays.gpu_rand(Float32, CUDA.CuKernelContext(), randstates)) / logp) + geom = ceil(CUDA.log(rand(Float32)) / logp) geom_sum += geom geom_sum > n && break num_geom += 1 @@ -99,8 +99,8 @@ function kernel_BTRS!(A, count, prob, randstates) m = floor((n + 1) * p) while true - usample = GPUArrays.gpu_rand(Float32, CUDA.CuKernelContext(), randstates) - 0.5f0 - vsample = GPUArrays.gpu_rand(Float32, CUDA.CuKernelContext(), randstates) + usample = rand(Float32) - 0.5f0 + vsample = rand(Float32) us = 0.5f0 - abs(usample) ks = floor((2 * a / us + b) * usample + c) diff --git a/src/rand_binomial.jl b/src/rand_binomial.jl index 2683521..cc4ba00 100644 --- a/src/rand_binomial.jl +++ b/src/rand_binomial.jl @@ -6,7 +6,7 @@ const BinomialType = Union{Type{<:Integer}} const BinomialArray = DenseCuArray{<:Integer} ## exported functions: in-place -rand_binomial!(A::BinomialArray; kwargs...) = rand_binomial!(gpuarrays_rng(), A; kwargs...) +function rand_binomial!(A::BinomialArray; kwargs...) end rand_binomial!(A::AnyCuArray; kwargs...) = error("BinomialGPU.jl does not support generating binomially-distributed random numbers of type $(eltype(A))") @@ -26,52 +26,52 @@ rand_binomial(dim1::Integer, dims::Integer...; kwargs...) = rand_binomial(gpuarrays_rng(), Dims((dim1, dims...)); kwargs...) 
## main internal function -function rand_binomial!(rng, A::BinomialArray; count, prob) - return rand_binom!(rng, A, count, prob) +function rand_binomial!(A::BinomialArray; count, prob) + return rand_binom!(A, count, prob) end ## dispatching on parameter types # constant parameters -function rand_binom!(rng, A::BinomialArray, count::Integer, prob::Number) +function rand_binom!(A::BinomialArray, count::Integer, prob::Number) # revert to full parameter case (this could be suboptimal, as a table-based method should in principle be faster) ns = CUDA.fill(Int(count), size(A)) ps = CUDA.fill(Float32(prob), size(A)) - return rand_binom!(rng, A, ns, ps) + return rand_binom!(A, ns, ps) end # arrays of parameters -function rand_binom!(rng, A::BinomialArray, count::BinomialArray, prob::Number) +function rand_binom!(A::BinomialArray, count::BinomialArray, prob::Number) # revert to full parameter case (this could be suboptimal, as a table-based method should in principle be faster) cucount = cu(count) ps = CUDA.fill(Float32(prob), size(A)) - return rand_binom!(rng, A, cucount, ps) + return rand_binom!(A, cucount, ps) end -function rand_binom!(rng, A::BinomialArray, count::Integer, prob::AbstractArray{<:Number}) +function rand_binom!(A::BinomialArray, count::Integer, prob::AbstractArray{<:Number}) # revert to full parameter case (this could be suboptimal, as a table-based method should in principle be faster) ns = CUDA.fill(Int(count), size(A)) cuprob = cu(prob) - return rand_binom!(rng, A, ns, cuprob) + return rand_binom!(A, ns, cuprob) end -function rand_binom!(rng, A::BinomialArray, count::BinomialArray, prob::AbstractArray{<:Number}) +function rand_binom!(A::BinomialArray, count::BinomialArray, prob::AbstractArray{<:Number}) cucount = cu(count) cuprob = cu(prob) - return rand_binom!(rng, A, cucount, cuprob) + return rand_binom!(A, cucount, cuprob) end -function rand_binom!(rng, A::BinomialArray, count::BinomialArray, prob::DenseCuArray{Float32}) +function rand_binom!(A::BinomialArray, count::BinomialArray, prob::DenseCuArray{Float32}) if ndims(count) > ndims(A) || ndims(prob) > ndims(A) throw(DimensionMismatch("`count` and `prob` need to be scalar or have less or equal dimensions than A")) return A end if size(A)[1:ndims(count)] == size(count) && size(A)[1:ndims(prob)] == size(prob) - kernel = @cuda name="BTRS_full" launch=false kernel_BTRS!(A, count, prob, rng.state) + kernel = @cuda name="BTRS_full" launch=false kernel_BTRS!(A, count, prob) config = launch_configuration(kernel.fun) threads = Base.min(length(A), config.threads, 256) # strangely seems to be faster when defaulting to 256 threads blocks = cld(length(A), threads) - kernel(A, count, prob, rng.state; threads=threads, blocks=blocks) + kernel(A, count, prob; threads=threads, blocks=blocks) else throw(DimensionMismatch("`count` and `prob` need have size compatible with A")) end From 377a39a98bb2aec4b3ff38e47345fe17fb4f416c Mon Sep 17 00:00:00 2001 From: Simone Carlo Surace Date: Fri, 26 Mar 2021 16:37:41 +0100 Subject: [PATCH 03/10] add compat entries --- Project.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Project.toml b/Project.toml index acdcf44..afdc628 100644 --- a/Project.toml +++ b/Project.toml @@ -14,6 +14,8 @@ LLVM = "929cbde3-209d-540e-8aea-75f648917ca0" BenchmarkTools = "0.6" CUDA = "2" GPUArrays = "6" +GPUCompiler = "0.10" +LLVM = "3.6" julia = "1.5" [extras] From 837182d12c4dd6404f4a42b3d7197bd403d99321 Mon Sep 17 00:00:00 2001 From: Simone Carlo Surace Date: Fri, 26 Mar 2021 21:03:33 +0100 Subject: [PATCH 04/10] fix 
issue #3: more robust indexing all tests should now pass --- src/kernels.jl | 18 +++++++++++++----- src/rand_binomial.jl | 18 ++++++++++++++++-- 2 files changed, 29 insertions(+), 7 deletions(-) diff --git a/src/kernels.jl b/src/kernels.jl index 114b0ee..fca2bca 100644 --- a/src/kernels.jl +++ b/src/kernels.jl @@ -32,14 +32,22 @@ end # BTRS algorithm, adapted from the tensorflow library (https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/kernels/random_binomial_op.cc) -function kernel_BTRS!(A, count, prob) +function kernel_BTRS!(A, count, prob, R1, R2, Rp, Ra, count_dim_larger_than_prob_dim) i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - indices = CartesianIndices(A) @inbounds if i <= length(A) - I = indices[i].I - n = count[CartesianIndex(I[1:ndims(count)])] - p = prob[CartesianIndex(I[1:ndims(prob)])] + I = Ra[i] + Ip = Rp[I[1]] + I1 = R1[Ip[1]] + I2 = R2[Ip[2]] + + if count_dim_larger_than_prob_dim + n = count[CartesianIndex(I1, I2)] + p = prob[I1] + else + n = count[I1] + p = prob[CartesianIndex(I1, I2)] + end # wrong parameter values (currently disabled) # n < 0 && throw(ArgumentError("kernel_BTRS!: count must be a nonnegative integer.")) diff --git a/src/rand_binomial.jl b/src/rand_binomial.jl index cc4ba00..4c04d49 100644 --- a/src/rand_binomial.jl +++ b/src/rand_binomial.jl @@ -67,11 +67,25 @@ function rand_binom!(A::BinomialArray, count::BinomialArray, prob::DenseCuArray{ return A end if size(A)[1:ndims(count)] == size(count) && size(A)[1:ndims(prob)] == size(prob) - kernel = @cuda name="BTRS_full" launch=false kernel_BTRS!(A, count, prob) + count_dim_larger_than_prob_dim = ndims(count) > ndims(prob) + if count_dim_larger_than_prob_dim + R1 = CartesianIndices(prob) # indices for count + R2 = CartesianIndices(size(count)[ndims(prob)+1:end]) # indices for prob that are not included in R1 + Rr = CartesianIndices(size(A)[ndims(count)+1:end]) # remaining indices in A + else + R1 = CartesianIndices(count) # indices for count + R2 = CartesianIndices(size(prob)[ndims(count)+1:end]) # indices for prob that are not included in R1 + Rr = CartesianIndices(size(A)[ndims(prob)+1:end]) # remaining indices in A + end + Rp = CartesianIndices((length(R1), length(R2))) # indices for parameters + Ra = CartesianIndices((length(Rp), length(Rr))) # indices for parameters and A + + kernel = @cuda name="BTRS_full" launch=false kernel_BTRS!(A, count, prob, R1, R2, Rp, Ra, count_dim_larger_than_prob_dim) config = launch_configuration(kernel.fun) threads = Base.min(length(A), config.threads, 256) # strangely seems to be faster when defaulting to 256 threads blocks = cld(length(A), threads) - kernel(A, count, prob; threads=threads, blocks=blocks) + + kernel(A, count, prob, R1, R2, Rp, Ra, count_dim_larger_than_prob_dim; threads=threads, blocks=blocks) else throw(DimensionMismatch("`count` and `prob` need have size compatible with A")) end From 64c037ffc08c05ca0b4c4cb5384713d45718ead7 Mon Sep 17 00:00:00 2001 From: Simone Carlo Surace Date: Fri, 26 Mar 2021 21:03:33 +0100 Subject: [PATCH 05/10] Revert "fix issue #3: more robust indexing" This reverts commit 837182d12c4dd6404f4a42b3d7197bd403d99321. 
--- src/kernels.jl | 18 +++++------------- src/rand_binomial.jl | 18 ++---------------- 2 files changed, 7 insertions(+), 29 deletions(-) diff --git a/src/kernels.jl b/src/kernels.jl index fca2bca..114b0ee 100644 --- a/src/kernels.jl +++ b/src/kernels.jl @@ -32,22 +32,14 @@ end # BTRS algorithm, adapted from the tensorflow library (https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/kernels/random_binomial_op.cc) -function kernel_BTRS!(A, count, prob, R1, R2, Rp, Ra, count_dim_larger_than_prob_dim) +function kernel_BTRS!(A, count, prob) i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + indices = CartesianIndices(A) @inbounds if i <= length(A) - I = Ra[i] - Ip = Rp[I[1]] - I1 = R1[Ip[1]] - I2 = R2[Ip[2]] - - if count_dim_larger_than_prob_dim - n = count[CartesianIndex(I1, I2)] - p = prob[I1] - else - n = count[I1] - p = prob[CartesianIndex(I1, I2)] - end + I = indices[i].I + n = count[CartesianIndex(I[1:ndims(count)])] + p = prob[CartesianIndex(I[1:ndims(prob)])] # wrong parameter values (currently disabled) # n < 0 && throw(ArgumentError("kernel_BTRS!: count must be a nonnegative integer.")) diff --git a/src/rand_binomial.jl b/src/rand_binomial.jl index 4c04d49..cc4ba00 100644 --- a/src/rand_binomial.jl +++ b/src/rand_binomial.jl @@ -67,25 +67,11 @@ function rand_binom!(A::BinomialArray, count::BinomialArray, prob::DenseCuArray{ return A end if size(A)[1:ndims(count)] == size(count) && size(A)[1:ndims(prob)] == size(prob) - count_dim_larger_than_prob_dim = ndims(count) > ndims(prob) - if count_dim_larger_than_prob_dim - R1 = CartesianIndices(prob) # indices for count - R2 = CartesianIndices(size(count)[ndims(prob)+1:end]) # indices for prob that are not included in R1 - Rr = CartesianIndices(size(A)[ndims(count)+1:end]) # remaining indices in A - else - R1 = CartesianIndices(count) # indices for count - R2 = CartesianIndices(size(prob)[ndims(count)+1:end]) # indices for prob that are not included in R1 - Rr = CartesianIndices(size(A)[ndims(prob)+1:end]) # remaining indices in A - end - Rp = CartesianIndices((length(R1), length(R2))) # indices for parameters - Ra = CartesianIndices((length(Rp), length(Rr))) # indices for parameters and A - - kernel = @cuda name="BTRS_full" launch=false kernel_BTRS!(A, count, prob, R1, R2, Rp, Ra, count_dim_larger_than_prob_dim) + kernel = @cuda name="BTRS_full" launch=false kernel_BTRS!(A, count, prob) config = launch_configuration(kernel.fun) threads = Base.min(length(A), config.threads, 256) # strangely seems to be faster when defaulting to 256 threads blocks = cld(length(A), threads) - - kernel(A, count, prob, R1, R2, Rp, Ra, count_dim_larger_than_prob_dim; threads=threads, blocks=blocks) + kernel(A, count, prob; threads=threads, blocks=blocks) else throw(DimensionMismatch("`count` and `prob` need have size compatible with A")) end From 9a6b02017b0bfc97a44af2afb1ae1e1890223c0c Mon Sep 17 00:00:00 2001 From: Simone Carlo Surace Date: Fri, 26 Mar 2021 21:29:08 +0100 Subject: [PATCH 06/10] remove spurious rng argument --- src/rand_binomial.jl | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/rand_binomial.jl b/src/rand_binomial.jl index 814d735..cfd201d 100644 --- a/src/rand_binomial.jl +++ b/src/rand_binomial.jl @@ -1,7 +1,5 @@ ## extend the CUDA.jl functionality (rand, randn, rand_poisson, etc.) 
to include binomial distributions -gpuarrays_rng() = GPUArrays.default_rng(CuArray) - const BinomialType = Union{Type{<:Integer}} const BinomialArray = DenseCuArray{<:Integer} @@ -80,12 +78,12 @@ function rand_binom!(A::BinomialArray, count::BinomialArray, prob::DenseCuArray{ Rp = CartesianIndices((length(R1), length(R2))) # indices for parameters Ra = CartesianIndices((length(Rp), length(Rr))) # indices for parameters and A - kernel = @cuda name="BTRS_full" launch=false kernel_BTRS!(A, count, prob, rng.state, R1, R2, Rp, Ra, count_dim_larger_than_prob_dim) + kernel = @cuda name="BTRS_full" launch=false kernel_BTRS!(A, count, prob, R1, R2, Rp, Ra, count_dim_larger_than_prob_dim) config = launch_configuration(kernel.fun) threads = Base.min(length(A), config.threads, 256) # strangely seems to be faster when defaulting to 256 threads blocks = cld(length(A), threads) - kernel(A, count, prob, rng.state, R1, R2, Rp, Ra, count_dim_larger_than_prob_dim; threads=threads, blocks=blocks) + kernel(A, count, prob, R1, R2, Rp, Ra, count_dim_larger_than_prob_dim; threads=threads, blocks=blocks) else throw(DimensionMismatch("`count` and `prob` need have size compatible with A")) end From 021b6230396c552bc369a239ef9c7e630315e9b2 Mon Sep 17 00:00:00 2001 From: Simone Carlo Surace Date: Fri, 26 Mar 2021 22:22:40 +0100 Subject: [PATCH 07/10] remove randstates argument --- src/kernels.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/kernels.jl b/src/kernels.jl index 7ae1295..fca2bca 100644 --- a/src/kernels.jl +++ b/src/kernels.jl @@ -32,7 +32,7 @@ end # BTRS algorithm, adapted from the tensorflow library (https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/kernels/random_binomial_op.cc) -function kernel_BTRS!(A, count, prob, randstates, R1, R2, Rp, Ra, count_dim_larger_than_prob_dim) +function kernel_BTRS!(A, count, prob, R1, R2, Rp, Ra, count_dim_larger_than_prob_dim) i = (blockIdx().x - 1) * blockDim().x + threadIdx().x @inbounds if i <= length(A) From 1c97e8a7f5252115dc2c4503fcbbbdf671bf999d Mon Sep 17 00:00:00 2001 From: Simone Carlo Surace Date: Fri, 26 Mar 2021 22:23:52 +0100 Subject: [PATCH 08/10] remove support for Julia 1.5 --- .buildkite/pipeline.yml | 20 -------------------- Project.toml | 2 +- 2 files changed, 1 insertion(+), 21 deletions(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index d5e96d2..131a630 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -1,26 +1,6 @@ steps: # Julia versions - - label: "Julia 1.5, CUDA 11.2" - plugins: - - JuliaCI/julia#v1: - version: 1.5 - - JuliaCI/julia-test#v1: - test_args: "--thorough" - - JuliaCI/julia-coverage#v1: - codecov: true - dirs: - - src - agents: - queue: "juliagpu" - cuda: "11.2" - cap: "recent" - env: - JULIA_CUDA_VERSION: '11.2' - JULIA_CUDA_USE_BINARYBUILDER: 'true' - if: build.message !~ /\[skip tests\]/ - timeout_in_minutes: 120 - - label: "Julia 1.6, CUDA 11.2" plugins: - JuliaCI/julia#v1: diff --git a/Project.toml b/Project.toml index afdc628..013da7b 100644 --- a/Project.toml +++ b/Project.toml @@ -16,7 +16,7 @@ CUDA = "2" GPUArrays = "6" GPUCompiler = "0.10" LLVM = "3.6" -julia = "1.5" +julia = "1.6" [extras] Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" From 04ba94a338adf2e051533a6696102cd67aff0730 Mon Sep 17 00:00:00 2001 From: Simone Carlo Surace Date: Fri, 26 Mar 2021 22:51:31 +0100 Subject: [PATCH 09/10] remove GPUArrays dep, add manifest --- .gitignore | 1 - Manifest.toml | 302 +++++++++++++++++++++++++++++++++++++++++++ Project.toml | 6 - 
src/BinomialGPU.jl | 1 - src/kernels.jl | 4 +- src/rand_binomial.jl | 7 +- 6 files changed, 307 insertions(+), 14 deletions(-) create mode 100644 Manifest.toml diff --git a/.gitignore b/.gitignore index 0ecfb73..287c938 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,5 @@ *.jl.*.cov *.jl.cov *.jl.mem -/Manifest.toml test.jl diff --git a/Manifest.toml b/Manifest.toml new file mode 100644 index 0000000..e296f3d --- /dev/null +++ b/Manifest.toml @@ -0,0 +1,302 @@ +# This file is machine-generated - editing it directly is not advised + +[[AbstractFFTs]] +deps = ["LinearAlgebra"] +git-tree-sha1 = "485ee0867925449198280d4af84bdb46a2a404d0" +uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c" +version = "1.0.1" + +[[Adapt]] +deps = ["LinearAlgebra"] +git-tree-sha1 = "ffcfa2d345aaee0ef3d8346a073d5dd03c983ebe" +uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" +version = "3.2.0" + +[[ArgTools]] +uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f" + +[[Artifacts]] +uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" + +[[BFloat16s]] +deps = ["LinearAlgebra", "Test"] +git-tree-sha1 = "4af69e205efc343068dc8722b8dfec1ade89254a" +uuid = "ab4f0b2a-ad5b-11e8-123f-65d77653426b" +version = "0.1.0" + +[[Base64]] +uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" + +[[BenchmarkTools]] +deps = ["JSON", "Logging", "Printf", "Statistics", "UUIDs"] +git-tree-sha1 = "8b8279aa9b15b4ee2d0e06bc5208f486a8ad65cc" +uuid = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" +version = "0.6.0" + +[[CEnum]] +git-tree-sha1 = "215a9aa4a1f23fbd05b92769fdd62559488d70e9" +uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82" +version = "0.4.1" + +[[CUDA]] +deps = ["AbstractFFTs", "Adapt", "BFloat16s", "CEnum", "CompilerSupportLibraries_jll", "DataStructures", "ExprTools", "GPUArrays", "GPUCompiler", "LLVM", "LazyArtifacts", "Libdl", "LinearAlgebra", "Logging", "MacroTools", "Memoize", "Printf", "Random", "RandomNumbers", "Reexport", "Requires", "SparseArrays", "SpecialFunctions", "Statistics", "TimerOutputs"] +git-tree-sha1 = "b6635c2faaa48550e2cef0c775c0931cb51a8006" +repo-rev = "tb/speedup_rand" +repo-url = "https://github.com/JuliaGPU/CUDA.jl.git" +uuid = "052768ef-5323-5732-b1bb-66c8b64840ba" +version = "2.6.0" + +[[ChainRulesCore]] +deps = ["Compat", "LinearAlgebra", "SparseArrays"] +git-tree-sha1 = "0893f8d90331a0f5223c7ef2a8868464394a886c" +uuid = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" +version = "0.9.33" + +[[Compat]] +deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "SHA", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"] +git-tree-sha1 = "919c7f3151e79ff196add81d7f4e45d91bbf420b" +uuid = "34da2185-b29b-5c13-b0c7-acf172513d20" +version = "3.25.0" + +[[CompilerSupportLibraries_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae" + +[[DataStructures]] +deps = ["Compat", "InteractiveUtils", "OrderedCollections"] +git-tree-sha1 = "4437b64df1e0adccc3e5d1adbc3ac741095e4677" +uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" +version = "0.18.9" + +[[Dates]] +deps = ["Printf"] +uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" + +[[DelimitedFiles]] +deps = ["Mmap"] +uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab" + +[[Distributed]] +deps = ["Random", "Serialization", "Sockets"] +uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" + +[[Downloads]] +deps = ["ArgTools", "LibCURL", "NetworkOptions"] +uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6" + +[[ExprTools]] +git-tree-sha1 = 
"10407a39b87f29d47ebaca8edbc75d7c302ff93e" +uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04" +version = "0.1.3" + +[[GPUArrays]] +deps = ["AbstractFFTs", "Adapt", "LinearAlgebra", "Printf", "Random", "Serialization"] +git-tree-sha1 = "f99a25fe0313121f2f9627002734c7d63b4dd3bd" +uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" +version = "6.2.0" + +[[GPUCompiler]] +deps = ["DataStructures", "ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "Scratch", "Serialization", "TimerOutputs", "UUIDs"] +git-tree-sha1 = "ef2839b063e158672583b9c09d2cf4876a8d3d55" +uuid = "61eb1bfa-7361-4325-ad38-22787b887f55" +version = "0.10.0" + +[[InteractiveUtils]] +deps = ["Markdown"] +uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" + +[[JLLWrappers]] +git-tree-sha1 = "a431f5f2ca3f4feef3bd7a5e94b8b8d4f2f647a0" +uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210" +version = "1.2.0" + +[[JSON]] +deps = ["Dates", "Mmap", "Parsers", "Unicode"] +git-tree-sha1 = "81690084b6198a2e1da36fcfda16eeca9f9f24e4" +uuid = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" +version = "0.21.1" + +[[LLVM]] +deps = ["CEnum", "Libdl", "Printf", "Unicode"] +git-tree-sha1 = "b616937c31337576360cb9fb872ec7633af7b194" +uuid = "929cbde3-209d-540e-8aea-75f648917ca0" +version = "3.6.0" + +[[LazyArtifacts]] +deps = ["Artifacts", "Pkg"] +uuid = "4af54fe1-eca0-43a8-85a7-787d91b784e3" + +[[LibCURL]] +deps = ["LibCURL_jll", "MozillaCACerts_jll"] +uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21" + +[[LibCURL_jll]] +deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"] +uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0" + +[[LibGit2]] +deps = ["Base64", "NetworkOptions", "Printf", "SHA"] +uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" + +[[LibSSH2_jll]] +deps = ["Artifacts", "Libdl", "MbedTLS_jll"] +uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8" + +[[Libdl]] +uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" + +[[LinearAlgebra]] +deps = ["Libdl"] +uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" + +[[Logging]] +uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" + +[[MacroTools]] +deps = ["Markdown", "Random"] +git-tree-sha1 = "6a8a2a625ab0dea913aba95c11370589e0239ff0" +uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" +version = "0.5.6" + +[[Markdown]] +deps = ["Base64"] +uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" + +[[MbedTLS_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1" + +[[Memoize]] +deps = ["MacroTools"] +git-tree-sha1 = "2b1dfcba103de714d31c033b5dacc2e4a12c7caa" +uuid = "c03570c3-d221-55d1-a50c-7939bbd78826" +version = "0.4.4" + +[[Mmap]] +uuid = "a63ad114-7e13-5084-954f-fe012c677804" + +[[MozillaCACerts_jll]] +uuid = "14a3606d-f60d-562e-9121-12d972cd8159" + +[[NetworkOptions]] +uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908" + +[[OpenSpecFun_jll]] +deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "9db77584158d0ab52307f8c04f8e7c08ca76b5b3" +uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e" +version = "0.5.3+4" + +[[OrderedCollections]] +git-tree-sha1 = "4fa2ba51070ec13fcc7517db714445b4ab986bdf" +uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" +version = "1.4.0" + +[[Parsers]] +deps = ["Dates"] +git-tree-sha1 = "c8abc88faa3f7a3950832ac5d6e690881590d6dc" +uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0" +version = "1.1.0" + +[[Pkg]] +deps = ["Artifacts", "Dates", "Downloads", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"] +uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" + +[[Printf]] +deps = 
["Unicode"] +uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" + +[[REPL]] +deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"] +uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" + +[[Random]] +deps = ["Serialization"] +uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" + +[[RandomNumbers]] +deps = ["Random", "Requires"] +git-tree-sha1 = "441e6fc35597524ada7f85e13df1f4e10137d16f" +uuid = "e6cf234a-135c-5ec9-84dd-332b85af5143" +version = "1.4.0" + +[[Reexport]] +git-tree-sha1 = "57d8440b0c7d98fc4f889e478e80f268d534c9d5" +uuid = "189a3867-3050-52da-a836-e630ba90ab69" +version = "1.0.0" + +[[Requires]] +deps = ["UUIDs"] +git-tree-sha1 = "4036a3bd08ac7e968e27c203d45f5fff15020621" +uuid = "ae029012-a4dd-5104-9daa-d747884805df" +version = "1.1.3" + +[[SHA]] +uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" + +[[Scratch]] +deps = ["Dates"] +git-tree-sha1 = "ad4b278adb62d185bbcb6864dc24959ab0627bf6" +uuid = "6c6a2e73-6563-6170-7368-637461726353" +version = "1.0.3" + +[[Serialization]] +uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" + +[[SharedArrays]] +deps = ["Distributed", "Mmap", "Random", "Serialization"] +uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383" + +[[Sockets]] +uuid = "6462fe0b-24de-5631-8697-dd941f90decc" + +[[SparseArrays]] +deps = ["LinearAlgebra", "Random"] +uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" + +[[SpecialFunctions]] +deps = ["ChainRulesCore", "OpenSpecFun_jll"] +git-tree-sha1 = "5919936c0e92cff40e57d0ddf0ceb667d42e5902" +uuid = "276daf66-3868-5448-9aa4-cd146d93841b" +version = "1.3.0" + +[[Statistics]] +deps = ["LinearAlgebra", "SparseArrays"] +uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" + +[[TOML]] +deps = ["Dates"] +uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76" + +[[Tar]] +deps = ["ArgTools", "SHA"] +uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" + +[[Test]] +deps = ["InteractiveUtils", "Logging", "Random", "Serialization"] +uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" + +[[TimerOutputs]] +deps = ["Printf"] +git-tree-sha1 = "32cdbe6cd2d214c25a0b88f985c9e0092877c236" +uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" +version = "0.5.8" + +[[UUIDs]] +deps = ["Random", "SHA"] +uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" + +[[Unicode]] +uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" + +[[Zlib_jll]] +deps = ["Libdl"] +uuid = "83775a58-1f1d-513f-b197-d71354ab007a" + +[[nghttp2_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d" + +[[p7zip_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0" diff --git a/Project.toml b/Project.toml index 013da7b..f8114e9 100644 --- a/Project.toml +++ b/Project.toml @@ -6,16 +6,10 @@ version = "0.2.3" [deps] BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" -GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" -GPUCompiler = "61eb1bfa-7361-4325-ad38-22787b887f55" -LLVM = "929cbde3-209d-540e-8aea-75f648917ca0" [compat] BenchmarkTools = "0.6" CUDA = "2" -GPUArrays = "6" -GPUCompiler = "0.10" -LLVM = "3.6" julia = "1.6" [extras] diff --git a/src/BinomialGPU.jl b/src/BinomialGPU.jl index 5e97da6..4a949d6 100644 --- a/src/BinomialGPU.jl +++ b/src/BinomialGPU.jl @@ -1,7 +1,6 @@ module BinomialGPU using CUDA -using GPUArrays # user-level API include("rand_binomial.jl") diff --git a/src/kernels.jl b/src/kernels.jl index fca2bca..661588e 100644 --- a/src/kernels.jl +++ b/src/kernels.jl @@ -143,14 +143,14 @@ end ## old, unused kernels (for reference) #naive algorithm, full -function kernel_naive_full!(A, count, prob, randstates) +function 
kernel_naive_full!(A, count, prob) index1 = (blockIdx().x - 1) * blockDim().x + threadIdx().x stride1 = blockDim().x * gridDim().x @inbounds for i in index1:stride1:length(A) A[i] = 0 for m in 1:count[i] - @inbounds A[i] += GPUArrays.gpu_rand(Float32, CUDA.CuKernelContext(), randstates) < prob[i] + @inbounds A[i] += rand(Float32) < prob[i] end end return diff --git a/src/rand_binomial.jl b/src/rand_binomial.jl index cfd201d..afe5a68 100644 --- a/src/rand_binomial.jl +++ b/src/rand_binomial.jl @@ -10,18 +10,17 @@ rand_binomial!(A::AnyCuArray; kwargs...) = error("BinomialGPU.jl does not support generating binomially-distributed random numbers of type $(eltype(A))") ## unexported functions: out of place -rand_binomial(T::BinomialType, dims::Dims; kwargs...) = rand_binomial(gpuarrays_rng(), T, dims; kwargs...) +function rand_binomial(T::BinomialType, dims::Dims; kwargs...) end rand_binomial(T::BinomialType, dim1::Integer, dims::Integer...; kwargs...) = - rand_binomial(gpuarrays_rng(), T, Dims((dim1, dims...)); kwargs...) + rand_binomial(T, Dims((dim1, dims...)); kwargs...) rand_binomial(T::Type, dims::Dims; kwargs...) = rand_binomial!(CuArray{T}(undef, dims...); kwargs...) rand_binomial(T::Type, dim1::Integer, dims::Integer...; kwargs...) = rand_binomial!(CuArray{T}(undef, dim1, dims...); kwargs...) -rand_binomial(dim1::Integer, dims::Integer...; kwargs...) = - rand_binomial(gpuarrays_rng(), Dims((dim1, dims...)); kwargs...) +rand_binomial(dim1::Integer, dims::Integer...; kwargs...) = rand_binomial(Dims((dim1, dims...)); kwargs...) ## main internal function function rand_binomial!(A::BinomialArray; count, prob) From 764036629afecd1a18f42c4c50ff369c1fa8cb6b Mon Sep 17 00:00:00 2001 From: Simone Carlo Surace Date: Fri, 26 Mar 2021 23:16:39 +0100 Subject: [PATCH 10/10] add GPUCompiler dep by hand --- Manifest.toml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Manifest.toml b/Manifest.toml index e296f3d..166852a 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -97,9 +97,11 @@ version = "6.2.0" [[GPUCompiler]] deps = ["DataStructures", "ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "Scratch", "Serialization", "TimerOutputs", "UUIDs"] -git-tree-sha1 = "ef2839b063e158672583b9c09d2cf4876a8d3d55" +git-tree-sha1 = "b6c3b8e2df6ffe0da0b10e2045ce35a3cf618b8a" +repo-rev = "1ecbe42" +repo-url = "https://github.com/JuliaGPU/GPUCompiler.jl.git" uuid = "61eb1bfa-7361-4325-ad38-22787b887f55" -version = "0.10.0" +version = "0.10.1" [[InteractiveUtils]] deps = ["Markdown"]
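
Usage note (not part of the patches): after this series the rng state is gone from the user-facing API (patches 02 and 06), so an in-place call takes only the target array plus the count/prob keywords, and the device-side rand(Float32) requires the CUDA.jl branch pinned in the Manifest (tb/speedup_rand). A minimal sketch in Julia; the array names and sizes below are illustrative, not taken from the patches:

    using CUDA, BinomialGPU

    A      = CUDA.zeros(Int, 16, 128)      # output array on the GPU
    counts = CUDA.fill(128, 16)            # per-row trial counts (fewer dims than A)
    probs  = CUDA.rand(Float32, 16, 128)   # per-element success probabilities

    # fills A in place; no rng argument anymore
    rand_binomial!(A; count = counts, prob = probs)

This exercises the broadcasting case that patch 04 makes robust: `count` has fewer dimensions than `prob`, and both match the leading dimensions of `A`.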