diff --git a/Chapter08/monte_carlo_integrator.py b/Chapter08/monte_carlo_integrator.py new file mode 100644 index 0000000..609d63d --- /dev/null +++ b/Chapter08/monte_carlo_integrator.py @@ -0,0 +1,141 @@ +import pycuda.autoinit +import pycuda.driver as drv +from pycuda import gpuarray +from pycuda.compiler import SourceModule +import numpy as np + +# https://docs.nvidia.com/cuda/cuda-math-api/index.html + +MonteCarloKernelTemplate = ''' +#include + +#define ULL unsigned long long +#define _R(z) ( 1.0f / (z) ) +#define _P2(z) ( (z) * (z) ) + +// p stands for "precision" (single or double) +__device__ inline %(p)s f(%(p)s x) +{ + %(p)s y; + + %(math_function)s; + + return y; +} + + +extern "C" { +__global__ void monte_carlo(int iters, %(p)s lo, %(p)s hi, %(p)s * ys_out) +{ + curandState cr_state; + + int tid = blockIdx.x * blockDim.x + threadIdx.x; + + int num_threads = blockDim.x * gridDim.x; + + %(p)s t_width = (hi - lo) / ( %(p)s ) num_threads; + + %(p)s density = ( ( %(p)s ) iters ) / t_width; + + %(p)s t_lo = t_width*tid + lo; + %(p)s t_hi = t_lo + t_width; + + + curand_init( (ULL) clock() + (ULL) tid, (ULL) 0, (ULL) 0, &cr_state); + + %(p)s y, y_sum = 0.0f; + + + %(p)s rand_val, x; + for (int i=0; i < iters; i++) + { + rand_val = curand_uniform%(p_curand)s(&cr_state); + + x = t_lo + t_width * rand_val; + + y_sum += f(x); + } + + y = y_sum / density; + + ys_out[tid] = y; +} + +} +''' + + +class MonteCarloIntegrator: + + def __init__(self, math_function='y = sin(x)', precision='d', lo=0, hi=np.pi, samples_per_thread=10**5, num_blocks=100): + + self.math_function = math_function + + if precision in [None, 's', 'S', 'single', np.float32]: + self.precision = 'float' + self.numpy_precision = np.float32 + self.p_curand = '' + elif precision in ['d','D', 'double', np.float64]: + self.precision = 'double' + self.numpy_precision = np.float64 + self.p_curand = '_double' + else: + raise Exception('precision is invalid datatype!') + + if (hi - lo <= 0): + raise Exception('hi - lo <= 0!') + else: + self.hi = hi + self.lo = lo + + MonteCarloDict = {'p' : self.precision, 'p_curand' : self.p_curand, 'math_function' : self.math_function} + + self.MonteCarloCode = MonteCarloKernelTemplate % MonteCarloDict + + self.ker = SourceModule(no_extern_c=True , options=['-w'], source=self.MonteCarloCode) + + self.f = self.ker.get_function('monte_carlo') + + self.num_blocks = num_blocks + + self.samples_per_thread = samples_per_thread + + + def definite_integral(self, lo=None, hi=None, samples_per_thread=None, num_blocks=None): + + if lo is None or hi is None: + lo = self.lo + hi = self.hi + + if samples_per_thread is None: + samples_per_thread = self.samples_per_thread + + if num_blocks is None: + num_blocks = self.num_blocks + grid = (num_blocks,1,1) + else: + grid = (num_blocks,1,1) + + block = (32,1,1) + + num_threads = 32*num_blocks + + self.ys = gpuarray.empty((num_threads,) , dtype=self.numpy_precision) + + self.f(np.int32(samples_per_thread), self.numpy_precision(lo), self.numpy_precision(hi), self.ys, block=block, grid=grid) + + self.nintegral = np.sum(self.ys.get() ) + + return np.sum(self.nintegral) + + + +if __name__ == '__main__': + + integral_tests = [('y =log(x)*_P2(sin(x))', 11.733 , 18.472, 8.9999), ('y = _R( 1 + sinh(2*x)*_P2(log(x)) )', .9, 4, .584977), ('y = (cosh(x)*sin(x))/ sqrt( pow(x,3) + _P2(sin(x)))', 1.85, 4.81, -3.34553) ] + + + for f, lo, hi, expected in integral_tests: + mci = MonteCarloIntegrator(math_function=f, precision='d', lo=lo, hi=hi) + print('The Monte Carlo numerical integration of the function\n \t f: x -> %s \n \t from x = %s to x = %s is : %s ' % (f, lo, hi, mci.definite_integral())) + print('where the expected value is : %s\n' % expected) diff --git a/Chapter08/monte_carlo_pi.py b/Chapter08/monte_carlo_pi.py new file mode 100644 index 0000000..cab4475 --- /dev/null +++ b/Chapter08/monte_carlo_pi.py @@ -0,0 +1,70 @@ +import pycuda.autoinit +import pycuda.driver as drv +from pycuda import gpuarray +from pycuda.compiler import SourceModule +import numpy as np +from sympy import Rational + +ker = SourceModule(no_extern_c=True ,source=''' +#include +#define _PYTHAG(a,b) (a*a + b*b) +#define ULL unsigned long long + +extern "C" { + +__global__ void estimate_pi(ULL iters, ULL * hits) +{ + + curandState cr_state; + + int tid = blockIdx.x * blockDim.x + threadIdx.x; + + curand_init( (ULL) clock() + (ULL) tid, (ULL) 0, \ + (ULL) 0, &cr_state); + + float x, y; + + for(ULL i=0; i < iters; i++) + { + + x = curand_uniform(&cr_state); + y = curand_uniform(&cr_state); + + + if(_PYTHAG(x,y) <= 1.0f) + hits[tid]++; + } + + return; + +} + +}// (End of 'extern "C"' here) +''') + + + +pi_ker = ker.get_function("estimate_pi") + +threads_per_block = 32 +blocks_per_grid = 512 + +total_threads = threads_per_block * blocks_per_grid + +hits_d = gpuarray.zeros((total_threads,),dtype=np.uint64) + +iters = 2**24 + +pi_ker(np.uint64(iters), hits_d, grid=(blocks_per_grid,1,1), block=(threads_per_block,1,1)) + +total_hits = np.sum( hits_d.get() ) +total = np.uint64(total_threads) * np.uint64(iters) + +est_pi_symbolic = Rational(4)*Rational(int(total_hits), int(total) ) + +est_pi = np.float(est_pi_symbolic.evalf()) + +print("Our Monte Carlo estimate of Pi is : %s" % est_pi) +print("NumPy's Pi constant is: %s " % np.pi) + +print("Our estimate passes NumPy's 'allclose' : %s" % np.allclose(est_pi, np.pi)) diff --git a/Chapter08/thrust_dot_product.cu b/Chapter08/thrust_dot_product.cu new file mode 100644 index 0000000..f73685f --- /dev/null +++ b/Chapter08/thrust_dot_product.cu @@ -0,0 +1,49 @@ +#include +#include +#include + +using namespace std; + +struct multiply_functor { + + float w; + + multiply_functor(float _w = 1) : w(_w) {} + + __device__ float operator() (const float & x, const float & y) { + return w * x * y; + } +}; + +float dot_product(thrust::device_vector &v, thrust::device_vector &w ) //, thrust::device_vector &z) +{ + thrust::device_vector z(v.size()); + + thrust::transform(v.begin(), v.end(), w.begin(), z.begin(), multiply_functor()); + + return thrust::reduce(z.begin(), z.end()); +} + +int main(void) +{ + + thrust::device_vector v; + + v.push_back(1.0f); + v.push_back(2.0f); + v.push_back(3.0f); + + thrust::device_vector w(3); + + thrust::fill(w.begin(), w.end(), 1.0f); + + for (int i = 0; i < v.size(); i++) + cout << "v[" << i << "] == " << v[i] << endl; + + for (int i = 0; i < w.size(); i++) + cout << "w[" << i << "] == " << w[i] << endl; + + cout << "dot_product(v , w) == " << dot_product(v,w) << endl; + + return 0; +}