pi.cpp
· 1.1 KiB · C++
Raw
#include <cstdlib>
#include <ctime>
#include <iostream>

#include <curand.h>
#include <curand_kernel.h>
/**
 * Monte Carlo sampling kernel: every launched thread draws one random point
 * and records whether it lies inside the unit quarter-circle.
 *
 * Each thread initialises its own cuRAND state from the shared `seed`, using
 * its flat 1D global thread index as the subsequence number, then draws two
 * uniform floats and atomically adds 1 to `*count` when x*x + y*y <= 1.
 *
 * There is no bounds guard, so the number of samples taken equals the number
 * of threads launched (gridDim.x * blockDim.x). Indexing is 1D only.
 *
 * @param count  Counter updated with atomicAdd; the kernel only adds to it
 *               and never resets it.
 * @param seed   Seed passed unchanged to curand_init by every thread.
 */
__global__ void calculate_pi(int *count, unsigned long seed) {
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;

    // Per-thread generator: same seed, subsequence selected by thread index.
    curandState rng;
    curand_init(seed, tid, 0, &rng);

    // Draw order matters for reproducibility: first draw is x, second is y.
    const float px = curand_uniform(&rng);
    const float py = curand_uniform(&rng);

    const bool hit = (px * px + py * py <= 1.0f);
    if (!hit) {
        return;
    }
    atomicAdd(count, 1);
}
// Terminates the program with a readable message when a CUDA runtime call
// reports anything other than cudaSuccess.
static void checkCuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        std::cerr << "CUDA error during " << what << ": "
                  << cudaGetErrorString(status) << std::endl;
        std::exit(EXIT_FAILURE);
    }
}

/**
 * Estimates pi by Monte Carlo sampling on the GPU and prints the result.
 *
 * Launches calculate_pi with enough threads to take at least N samples,
 * reads back the number of points that fell inside the unit quarter-circle,
 * and prints 4 * hits / samples.
 *
 * @return 0 on success; the process exits with EXIT_FAILURE if any CUDA
 *         call or the kernel launch fails.
 */
int main() {
    const int N = 1000000;  // minimum number of samples requested

    // 256 threads per block: a multiple of the warp size that leaves more
    // registers available per thread than a 1024-thread block would.
    const int threadsPerBlock = 256;
    const int blocks = (N + threadsPerBlock - 1) / threadsPerBlock;  // ceil-div

    // The kernel has no bounds guard, so every launched thread contributes
    // one sample. The grid is rounded up to whole blocks, which makes the
    // real sample count slightly larger than N; dividing by N instead would
    // bias the estimate upward.
    const long long samples = static_cast<long long>(blocks) * threadsPerBlock;

    int *d_count = NULL;
    int h_count = 0;

    // Allocate the device-side hit counter and initialise it to zero.
    checkCuda(cudaMalloc((void**)&d_count, sizeof(int)), "cudaMalloc");
    checkCuda(cudaMemcpy(d_count, &h_count, sizeof(int), cudaMemcpyHostToDevice),
              "cudaMemcpy (host to device)");

    // Launch the kernel, seeded from the wall clock.
    calculate_pi<<<blocks, threadsPerBlock>>>(
        d_count, static_cast<unsigned long>(time(NULL)));
    checkCuda(cudaGetLastError(), "kernel launch");          // bad launch config
    checkCuda(cudaDeviceSynchronize(), "kernel execution");  // in-kernel faults

    // Copy the hit count back to the host.
    checkCuda(cudaMemcpy(&h_count, d_count, sizeof(int), cudaMemcpyDeviceToHost),
              "cudaMemcpy (device to host)");

    // hits / samples approximates pi / 4 (area of the unit quarter-circle).
    const double pi = 4.0 * h_count / static_cast<double>(samples);
    std::cout << "Estimated Pi = " << pi << std::endl;

    // Free device memory.
    checkCuda(cudaFree(d_count), "cudaFree");

    return 0;
}
| 1 | #include <iostream> |
| 2 | #include <curand.h> |
| 3 | #include <curand_kernel.h> |
| 4 | |
| 5 | __global__ void calculate_pi(int *count, unsigned long seed) { |
| 6 | int idx = threadIdx.x + blockIdx.x * blockDim.x; |
| 7 | curandState state; |
| 8 | curand_init(seed, idx, 0, &state); |
| 9 | |
| 10 | float x = curand_uniform(&state); |
| 11 | float y = curand_uniform(&state); |
| 12 | if (x * x + y * y <= 1.0f) { |
| 13 | atomicAdd(count, 1); |
| 14 | } |
| 15 | } |
| 16 | |
| 17 | int main() { |
| 18 | int N = 1000000; |
| 19 | int *d_count, h_count = 0; |
| 20 | |
| 21 | // Allocate memory on the device |
| 22 | cudaMalloc((void**)&d_count, sizeof(int)); |
| 23 | cudaMemcpy(d_count, &h_count, sizeof(int), cudaMemcpyHostToDevice); |
| 24 | |
| 25 | // Launch kernel |
| 26 | int threadsPerBlock = 1024; |
| 27 | int blocks = (N + threadsPerBlock - 1) / threadsPerBlock; |
| 28 | calculate_pi<<<blocks, threadsPerBlock>>>(d_count, time(NULL)); |
| 29 | |
| 30 | // Copy result back to host |
| 31 | cudaMemcpy(&h_count, d_count, sizeof(int), cudaMemcpyDeviceToHost); |
| 32 | |
| 33 | // Calculate pi |
| 34 | float pi = 4.0f * h_count / N; |
| 35 | std::cout << "Estimated Pi = " << pi << std::endl; |
| 36 | |
| 37 | // Free device memory |
| 38 | cudaFree(d_count); |
| 39 | |
| 40 | return 0; |
| 41 | } |
| 42 |