-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathkernelExample.R
More file actions
91 lines (65 loc) · 2.65 KB
/
kernelExample.R
File metadata and controls
91 lines (65 loc) · 2.65 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
# Modification of one of the RCUDA examples to use double precision.
# Computes a normal (log-)likelihood kernel on the GPU and compares
# timings against a pure-R CPU computation.
library(RCUDA)

cat("Setting cuGetContext(TRUE)...\n")
cuGetContext(TRUE)

# Compile the kernel into a PTX form that RCUDA can load.
# Equivalent shell invocation:
# system("nvcc --ptx -arch=compute_20 -code=sm_20,compute_20 -o calc_loglik.ptx calc_loglik.cu")
ptx <- nvcc(file = 'calc_loglik.cu', out = 'calc_loglik.ptx',
            target = "ptx", "-arch=compute_20", "-code=sm_20,compute_20")
mod <- loadModule(ptx)
calc_loglik <- mod$calc_loglik

# Problem size (2^27 observations) and simulated inputs.
n <- 134217728L
set.seed(0)
x <- runif(n)
mu <- 0.3
sigma <- 1.5
# Launch configuration: 1024 threads per block, and a square grid just
# large enough that grid_d^2 * threads_per_block covers all n elements.
threads_per_block <- 1024L
block_dims <- c(threads_per_block, 1L, 1L)
grid_d <- as.integer(ceiling(sqrt(n / threads_per_block)))
grid_dims <- c(grid_d, grid_d, 1L)

cat("Grid size:\n")
print(grid_dims)

# Sanity check: we need at least one thread per observation.
nthreads <- as.integer(prod(block_dims) * prod(grid_dims))
cat("Total number of threads to launch = ", nthreads, "\n")
if (nthreads < n) {
  stop("Grid is not large enough...!")
}
cat("Running CUDA kernel...\n")
# basic usage with manual transfer
# Copy the input vector to GPU memory explicitly; this timing isolates the
# host-to-device transfer cost. cudaDeviceSynchronize() forces the device
# work to finish before system.time() stops the clock.
tTransferToGPU <- system.time({
dX = copyToDevice(x, strict = TRUE)
cudaDeviceSynchronize()
})
# Launch the kernel on the already-resident device buffer. The kernel
# appears to overwrite dX in place (its result is read back from dX
# below) -- NOTE(review): confirm against calc_loglik.cu.
tCalc <- system.time({
.cuda(calc_loglik, dX, n, mu, sigma, gridDim = grid_dims, blockDim = block_dims, .numericAsDouble = getOption("CUDA.useDouble", TRUE))
cudaDeviceSynchronize()
})
# Copy the (modified) device buffer back into an R double vector; this
# timing isolates the device-to-host transfer cost.
tTransferFromGPU <- system.time({
out = copyFromDevice(obj = dX, nels = dX@nels, type = "double")
cudaDeviceSynchronize()
})
cat("Input values: ", x[1:3], "\n")
cat("Output values: ", out[1:3], "\n")
# alternative that bundles transfer and computation all in one, with
# implicit transfer done by RCUDA behind the scenes; outputs = "x" asks
# RCUDA to copy the named argument back to the host after the launch.
tFull <- system.time({
out <- .cuda(calc_loglik, "x"=x, n, mu, sigma, gridDim=grid_dims, blockDim=block_dims, outputs="x", .numericAsDouble = getOption("CUDA.useDouble", TRUE))
cudaDeviceSynchronize()
})
cat("Output values (implicit transfer): ", out[1:3], "\n")
# having RCUDA determine gridding - not working for some reason
## tCalc_gridby <- system.time({
## .cuda(calc_loglik, dX, n, mu, sigma, gridBy = nthreads, .numericAsDouble = getOption("CUDA.useDouble", TRUE))
## cudaDeviceSynchronize()
## })
# CPU baseline for comparison. NOTE(review): this computes the normal
# density, not its log -- confirm that calc_loglik.cu matches (the kernel
# name suggests a log-likelihood).
tCalc_R <- system.time({
out <- dnorm(x, mu, sigma)
})
cat("Output values (CPU with R): ", out[1:3], "\n")
# Timing summary: element [3] of a proc_time object is elapsed wall time.
cat("Transfer to GPU time: ", tTransferToGPU[3], "\n")
cat("Calculation time (GPU): ", tCalc[3], "\n")
cat("Transfer from GPU time: ", tTransferFromGPU[3], "\n")
cat("Calculation time (CPU): ", tCalc_R[3], "\n")
cat("Combined calculation/transfer via .cuda time (GPU): ", tFull[3], "\n")
# (variable name fixed to match tCalc_gridby defined above)
#cat("Calculation time (GPU with gridBy): ", tCalc_gridby[3], "\n")