#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <iostream>

#include <cuda_runtime.h>
#include <curand_kernel.h>

// Evaluates a CUDA runtime call and terminates the program with the file,
// line and CUDA error string if the call did not return cudaSuccess.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err_));                                \
            std::exit(EXIT_FAILURE);                                          \
        }                                                                     \
    } while (0)

/**
 * Monte Carlo sampling kernel for estimating pi.
 *
 * Each thread initialises its own cuRAND state, draws one (x, y) pair with
 * curand_uniform and atomically increments *count when x*x + y*y <= 1.
 *
 * Launch layout: 1D grid of 1D blocks. There is no bounds guard, so EVERY
 * launched thread contributes exactly one sample; the caller must divide the
 * hit count by gridDim.x * blockDim.x, not by a separately requested size.
 *
 * @param count  Device pointer to the hit counter; the caller zeroes it
 *               before the launch.
 * @param seed   cuRAND seed shared by all threads; each thread passes its
 *               global thread index as the sequence number.
 */
__global__ void calculate_pi(int *count, unsigned long seed) {
    // Widen before multiplying so the flat index cannot overflow 32 bits on
    // very large grids.
    const unsigned long long idx =
        static_cast<unsigned long long>(blockIdx.x) * blockDim.x + threadIdx.x;

    curandState state;
    curand_init(seed, idx, 0, &state);

    const float x = curand_uniform(&state);
    const float y = curand_uniform(&state);

    // One atomic per hit on a single counter.
    if (x * x + y * y <= 1.0f) {
        atomicAdd(count, 1);
    }
}

/**
 * Host driver: zeroes a device counter, launches calculate_pi with at least
 * N threads, copies the hit count back and prints 4 * hits / samples.
 *
 * @return 0 on success; the process exits with EXIT_FAILURE from CUDA_CHECK
 *         if any CUDA call or the kernel launch fails.
 */
int main() {
    const int N = 1000000;  // requested (minimum) number of samples

    // 256 is a multiple of the 32-thread warp size and stays well below the
    // per-block thread limit, so the launch does not depend on the kernel
    // fitting at the maximum block size.
    const int threadsPerBlock = 256;
    const int blocks = (N + threadsPerBlock - 1) / threadsPerBlock;  // ceil-div

    // The ceil-div rounds the grid up and the kernel samples in every thread,
    // so the real sample count is the launched thread count, not N.
    const long long samples = static_cast<long long>(blocks) * threadsPerBlock;

    int h_count = 0;
    int *d_count = nullptr;

    // Allocate the device counter and initialise it to zero.
    CUDA_CHECK(cudaMalloc((void**)&d_count, sizeof(int)));
    CUDA_CHECK(cudaMemcpy(d_count, &h_count, sizeof(int), cudaMemcpyHostToDevice));

    // Launch the kernel, seeded from the wall clock.
    calculate_pi<<<blocks, threadsPerBlock>>>(
        d_count, static_cast<unsigned long>(time(NULL)));
    CUDA_CHECK(cudaGetLastError());       // launch-configuration errors
    CUDA_CHECK(cudaDeviceSynchronize());  // errors raised while the kernel ran

    // Copy the hit count back to the host.
    CUDA_CHECK(cudaMemcpy(&h_count, d_count, sizeof(int), cudaMemcpyDeviceToHost));

    // Fraction of points inside the quarter circle, times 4, estimates pi.
    const float pi = 4.0f * h_count / static_cast<float>(samples);
    std::cout << "Estimated Pi = " << pi << std::endl;

    // Free device memory.
    CUDA_CHECK(cudaFree(d_count));
    return 0;
}