anduin revised this gist . Go to revision
1 file changed, 0 insertions, 0 deletions
pi.cu renamed to pi.cpp
File renamed without changes
anduin revised this gist . Go to revision
1 file changed, 41 insertions
pi.cu(file created)
@@ -0,0 +1,41 @@ | |||
1 | + | #include <iostream> | |
2 | + | #include <curand.h> | |
3 | + | #include <curand_kernel.h> | |
4 | + | ||
5 | + | __global__ void calculate_pi(int *count, unsigned long seed) { | |
6 | + | int idx = threadIdx.x + blockIdx.x * blockDim.x; | |
7 | + | curandState state; | |
8 | + | curand_init(seed, idx, 0, &state); | |
9 | + | ||
10 | + | float x = curand_uniform(&state); | |
11 | + | float y = curand_uniform(&state); | |
12 | + | if (x * x + y * y <= 1.0f) { | |
13 | + | atomicAdd(count, 1); | |
14 | + | } | |
15 | + | } | |
16 | + | ||
17 | + | int main() { | |
18 | + | int N = 1000000; | |
19 | + | int *d_count, h_count = 0; | |
20 | + | ||
21 | + | // Allocate memory on the device | |
22 | + | cudaMalloc((void**)&d_count, sizeof(int)); | |
23 | + | cudaMemcpy(d_count, &h_count, sizeof(int), cudaMemcpyHostToDevice); | |
24 | + | ||
25 | + | // Launch kernel | |
26 | + | int threadsPerBlock = 1024; | |
27 | + | int blocks = (N + threadsPerBlock - 1) / threadsPerBlock; | |
28 | + | calculate_pi<<<blocks, threadsPerBlock>>>(d_count, time(NULL)); | |
29 | + | ||
30 | + | // Copy result back to host | |
31 | + | cudaMemcpy(&h_count, d_count, sizeof(int), cudaMemcpyDeviceToHost); | |
32 | + | ||
33 | + | // Calculate pi | |
34 | + | float pi = 4.0f * h_count / N; | |
35 | + | std::cout << "Estimated Pi = " << pi << std::endl; | |
36 | + | ||
37 | + | // Free device memory | |
38 | + | cudaFree(d_count); | |
39 | + | ||
40 | + | return 0; | |
41 | + | } |