From 34f4d14daa0f8d21fb45baa7a6b0a587045a78e2 Mon Sep 17 00:00:00 2001
From: mp
Date: Tue, 18 Feb 2020 13:07:44 -0800
Subject: [PATCH] added chapter05 examples...

---
 Chapter05/conway_gpu_streams.py          | 104 +++++++++++++++++++++++
 Chapter05/gpu_mandelbrot_context_sync.py |  79 +++++++++++++++++
 Chapter05/multi-kernel.py                |  60 +++++++++++++
 Chapter05/multi-kernel_events.py         |  75 ++++++++++++++++
 Chapter05/multi-kernel_multi-thread.py   |  84 ++++++++++++++++++
 Chapter05/multi-kernel_streams.py        |  64 ++++++++++++++
 Chapter05/simple_context_create.py       |  12 +++
 Chapter05/simple_event_example.py        |  47 ++++++++++
 Chapter05/single_thread_example.py       |  20 +++++
 9 files changed, 545 insertions(+)
 create mode 100644 Chapter05/conway_gpu_streams.py
 create mode 100644 Chapter05/gpu_mandelbrot_context_sync.py
 create mode 100644 Chapter05/multi-kernel.py
 create mode 100644 Chapter05/multi-kernel_events.py
 create mode 100644 Chapter05/multi-kernel_multi-thread.py
 create mode 100644 Chapter05/multi-kernel_streams.py
 create mode 100644 Chapter05/simple_context_create.py
 create mode 100644 Chapter05/simple_event_example.py
 create mode 100644 Chapter05/single_thread_example.py

diff --git a/Chapter05/conway_gpu_streams.py b/Chapter05/conway_gpu_streams.py
new file mode 100644
index 0000000..0e16925
--- /dev/null
+++ b/Chapter05/conway_gpu_streams.py
@@ -0,0 +1,104 @@
+# CUDA Stream-based Concurrent Conway's game of life in Python / CUDA C
+# written by Brian Tuomanen for "Hands on GPU Programming with Python and CUDA"
+
+import pycuda.autoinit
+import pycuda.driver as drv
+from pycuda import gpuarray
+from pycuda.compiler import SourceModule
+import numpy as np
+import matplotlib.pyplot as plt
+import matplotlib.animation as animation
+
+ker = SourceModule("""
+#define _X ( threadIdx.x + blockIdx.x * blockDim.x )
+#define _Y ( threadIdx.y + blockIdx.y * blockDim.y )
+
+#define _WIDTH  ( blockDim.x * gridDim.x )
+#define _HEIGHT ( blockDim.y * gridDim.y )
+
+#define _XM(x) ( (x + _WIDTH) % _WIDTH )
+#define _YM(y) ( (y + _HEIGHT) % _HEIGHT )
+
+#define _INDEX(x,y) ( _XM(x) + _YM(y) * _WIDTH )
+
+// return the number of living neighbors for a given cell
+__device__ int nbrs(int x, int y, int * in)
+{
+    return ( in[ _INDEX(x -1, y+1) ] + in[ _INDEX(x-1, y) ] + in[ _INDEX(x-1, y-1) ] \
+             + in[ _INDEX(x, y+1)] + in[_INDEX(x, y - 1)] \
+             + in[ _INDEX(x+1, y+1) ] + in[ _INDEX(x+1, y) ] + in[ _INDEX(x+1, y-1) ] );
+}
+
+__global__ void conway_ker(int * lattice_out, int * lattice )
+{
+    // x, y are the appropriate values for the cell covered by this thread
+    int x = _X, y = _Y;
+
+    // count the number of neighbors around the current cell
+    int n = nbrs(x, y, lattice);
+
+    // if the current cell is alive, then determine if it lives or dies for the next generation.
+    if ( lattice[_INDEX(x,y)] == 1)
+        switch(n)
+        {
+            // if the cell is alive: it remains alive only if it has 2 or 3 neighbors.
+            case 2:
+            case 3: lattice_out[_INDEX(x,y)] = 1;
+                    break;
+            default: lattice_out[_INDEX(x,y)] = 0;
+        }
+    else if( lattice[_INDEX(x,y)] == 0 )
+        switch(n)
+        {
+            // a dead cell comes to life only if it has 3 neighbors that are alive.
+            case 3: lattice_out[_INDEX(x,y)] = 1;
+                    break;
+            default: lattice_out[_INDEX(x,y)] = 0;
+        }
+
+}
+""")
+
+
+conway_ker = ker.get_function("conway_ker")
+
+
+def update_gpu(frameNum, imgs, newLattices_gpu, lattices_gpu, N, streams, num_concurrent):
+
+    for k in range(num_concurrent):
+        conway_ker( newLattices_gpu[k], lattices_gpu[k], grid=(N//32,N//32,1), block=(32,32,1), stream=streams[k] )
+
+        imgs[k].set_data(newLattices_gpu[k].get_async(stream=streams[k]) )
+
+        lattices_gpu[k].set_async(newLattices_gpu[k], stream=streams[k])
+
+    return imgs
+
+
+if __name__ == '__main__':
+    # set lattice size
+    N = 128
+
+    num_concurrent = 4
+
+    streams = []
+    lattices_gpu = []
+    newLattices_gpu = []
+
+    for k in range(num_concurrent):
+        streams.append(drv.Stream())
+        lattice = np.int32( np.random.choice([1,0], N*N, p=[0.25, 0.75]).reshape(N, N) )
+        lattices_gpu.append(gpuarray.to_gpu(lattice))
+        newLattices_gpu.append(gpuarray.empty_like(lattices_gpu[k]))
+
+    fig, ax = plt.subplots(nrows=1, ncols=num_concurrent)
+    imgs = []
+
+    for k in range(num_concurrent):
+        imgs.append( ax[k].imshow(lattices_gpu[k].get_async(stream=streams[k]), interpolation='nearest') )
+
+    ani = animation.FuncAnimation(fig, update_gpu, fargs=(imgs, newLattices_gpu, lattices_gpu, N, streams, num_concurrent), interval=0, frames=1000, save_count=1000)
+
+    plt.show()
diff --git a/Chapter05/gpu_mandelbrot_context_sync.py b/Chapter05/gpu_mandelbrot_context_sync.py
new file mode 100644
index 0000000..d1688c0
--- /dev/null
+++ b/Chapter05/gpu_mandelbrot_context_sync.py
@@ -0,0 +1,79 @@
+from time import time
+import matplotlib
+#this will prevent the figure from popping up
+matplotlib.use('Agg')
+from matplotlib import pyplot as plt
+import numpy as np
+import pycuda.autoinit
+from pycuda import gpuarray
+from pycuda.elementwise import ElementwiseKernel
+
+mandel_ker = ElementwiseKernel(
+"pycuda::complex<float> *lattice, float *mandelbrot_graph, int max_iters, float upper_bound",
+"""
+mandelbrot_graph[i] = 1;
+
+pycuda::complex<float> c = lattice[i];
+pycuda::complex<float> z(0,0);
+
+for (int j = 0; j < max_iters; j++)
+    {
+
+    z = z*z + c;
+
+    if(abs(z) > upper_bound)
+        {
+        mandelbrot_graph[i] = 0;
+        break;
+        }
+
+    }
+
+""",
+"mandel_ker")
+
+def gpu_mandelbrot(width, height, real_low, real_high, imag_low, imag_high, max_iters, upper_bound):
+
+    # we set up our complex lattice as such
+    real_vals = np.matrix(np.linspace(real_low, real_high, width), dtype=np.complex64)
+    imag_vals = np.matrix(np.linspace( imag_high, imag_low, height), dtype=np.complex64) * 1j
+    mandelbrot_lattice = np.array(real_vals + imag_vals.transpose(), dtype=np.complex64)
+
+    # copy complex lattice to the GPU
+    mandelbrot_lattice_gpu = gpuarray.to_gpu_async(mandelbrot_lattice)
+
+    # synchronize in current context
+    pycuda.autoinit.context.synchronize()
+
+    # allocate an empty array on the GPU
+    mandelbrot_graph_gpu = gpuarray.empty(shape=mandelbrot_lattice.shape, dtype=np.float32)
+
+    mandel_ker( mandelbrot_lattice_gpu, mandelbrot_graph_gpu, np.int32(max_iters), np.float32(upper_bound))
+
+    pycuda.autoinit.context.synchronize()
+
+    mandelbrot_graph = mandelbrot_graph_gpu.get_async()
+
+    pycuda.autoinit.context.synchronize()
+
+    return mandelbrot_graph
+
+
+if __name__ == '__main__':
+
+    t1 = time()
+    mandel = gpu_mandelbrot(512,512,-2,2,-2,2,256, 2)
+    t2 = time()
+
+    mandel_time = t2 - t1
+
+    t1 = time()
+    fig = plt.figure(1)
+    plt.imshow(mandel, extent=(-2, 2, -2, 2))
+    plt.savefig('mandelbrot.png', dpi=fig.dpi)
+    t2 = time()
+
+    dump_time = t2 - t1
+
+    print('It took {} seconds to calculate the Mandelbrot graph.'.format(mandel_time))
+    print('It took {} seconds to dump the image.'.format(dump_time))
diff --git a/Chapter05/multi-kernel.py b/Chapter05/multi-kernel.py
new file mode 100644
index 0000000..e3dfc98
--- /dev/null
+++ b/Chapter05/multi-kernel.py
@@ -0,0 +1,60 @@
+import pycuda.autoinit
+import pycuda.driver as drv
+from pycuda import gpuarray
+from pycuda.compiler import SourceModule
+import numpy as np
+from time import time
+
+num_arrays = 200
+array_len = 1024**2
+
+ker = SourceModule("""
+__global__ void mult_ker(float * array, int array_len)
+{
+    int thd = blockIdx.x*blockDim.x + threadIdx.x;
+    int num_iters = array_len / blockDim.x;
+
+    for(int j=0; j < num_iters; j++)
+    {
+        int i = j * blockDim.x + thd;
+
+        for(int k = 0; k < 50; k++)
+        {
+            array[i] *= 2.0;
+            array[i] /= 2.0;
+        }
+    }
+
+}
+""")
+
+mult_ker = ker.get_function('mult_ker')
+
+data = []
+data_gpu = []
+gpu_out = []
+
+# generate random arrays.
+for _ in range(num_arrays):
+    data.append(np.random.randn(array_len).astype('float32'))
+
+t_start = time()
+
+# copy arrays to GPU.
+for k in range(num_arrays):
+    data_gpu.append(gpuarray.to_gpu(data[k]))
+
+# process arrays.
+for k in range(num_arrays):
+    mult_ker(data_gpu[k], np.int32(array_len), block=(64,1,1), grid=(1,1,1))
+
+# copy arrays from GPU.
+for k in range(num_arrays):
+    gpu_out.append(data_gpu[k].get())
+
+t_end = time()
+
+for k in range(num_arrays):
+    assert (np.allclose(gpu_out[k], data[k]))
+
+print('Total time: %f' % (t_end - t_start))
diff --git a/Chapter05/multi-kernel_events.py b/Chapter05/multi-kernel_events.py
new file mode 100644
index 0000000..8d60e00
--- /dev/null
+++ b/Chapter05/multi-kernel_events.py
@@ -0,0 +1,75 @@
+import pycuda.autoinit
+import pycuda.driver as drv
+from pycuda import gpuarray
+from pycuda.compiler import SourceModule
+import numpy as np
+from time import time
+
+num_arrays = 200
+array_len = 1024**2
+
+ker = SourceModule("""
+__global__ void mult_ker(float * array, int array_len)
+{
+    int thd = blockIdx.x*blockDim.x + threadIdx.x;
+    int num_iters = array_len / blockDim.x;
+    for(int j=0; j < num_iters; j++)
+    {
+        int i = j * blockDim.x + thd;
+        for(int k = 0; k < 50; k++)
+        {
+            array[i] *= 2.0;
+            array[i] /= 2.0;
+        }
+    }
+}
+""")
+
+mult_ker = ker.get_function('mult_ker')
+
+data = []
+data_gpu = []
+gpu_out = []
+streams = []
+start_events = []
+end_events = []
+
+for _ in range(num_arrays):
+    streams.append(drv.Stream())
+    start_events.append(drv.Event())
+    end_events.append(drv.Event())
+
+# generate random arrays.
+for _ in range(num_arrays):
+    data.append(np.random.randn(array_len).astype('float32'))
+
+t_start = time()
+
+# copy arrays to GPU.
+for k in range(num_arrays):
+    data_gpu.append(gpuarray.to_gpu_async(data[k], stream=streams[k]))
+
+# process arrays.
+for k in range(num_arrays):
+    start_events[k].record(streams[k])
+    mult_ker(data_gpu[k], np.int32(array_len), block=(64,1,1), grid=(1,1,1), stream=streams[k])
+for k in range(num_arrays):
+    end_events[k].record(streams[k])
+
+# copy arrays from GPU.
+for k in range(num_arrays):
+    gpu_out.append(data_gpu[k].get_async(stream=streams[k]))
+
+t_end = time()
+
+for k in range(num_arrays):
+    assert (np.allclose(gpu_out[k], data[k]))
+
+kernel_times = []
+
+for k in range(num_arrays):
+    kernel_times.append(start_events[k].time_till(end_events[k]))
+
+print('Total time: %f' % (t_end - t_start))
+print('Mean kernel duration (milliseconds): %f' % np.mean(kernel_times))
+print('Standard deviation of kernel duration (milliseconds): %f' % np.std(kernel_times))
diff --git a/Chapter05/multi-kernel_multi-thread.py b/Chapter05/multi-kernel_multi-thread.py
new file mode 100644
index 0000000..dcf745d
--- /dev/null
+++ b/Chapter05/multi-kernel_multi-thread.py
@@ -0,0 +1,84 @@
+import pycuda
+import pycuda.driver as drv
+from pycuda import gpuarray
+from pycuda.compiler import SourceModule
+import numpy as np
+from time import time
+import threading
+
+
+num_arrays = 10
+array_len = 1024**2
+
+kernel_code = """
+__global__ void mult_ker(float * array, int array_len)
+{
+    int thd = blockIdx.x*blockDim.x + threadIdx.x;
+    int num_iters = array_len / blockDim.x;
+
+    for(int j=0; j < num_iters; j++)
+    {
+        int i = j * blockDim.x + thd;
+
+        for(int k = 0; k < 50; k++)
+        {
+            array[i] *= 2.0;
+            array[i] /= 2.0;
+        }
+    }
+
+}
+"""
+
+class KernelLauncherThread(threading.Thread):
+    def __init__(self, input_array):
+        threading.Thread.__init__(self)
+        self.input_array = input_array
+        self.output_array = None
+
+    def run(self):
+        self.dev = drv.Device(0)
+        self.context = self.dev.make_context()
+
+        self.ker = SourceModule(kernel_code)
+
+        self.mult_ker = self.ker.get_function('mult_ker')
+
+        self.array_gpu = gpuarray.to_gpu(self.input_array)
+
+        self.mult_ker(self.array_gpu, np.int32(array_len), block=(64,1,1), grid=(1,1,1))
+
+        self.output_array = self.array_gpu.get()
+
+        self.context.pop()
+
+    def join(self):
+        threading.Thread.join(self)
+        return self.output_array
+
+drv.init()
+
+
+data = []
+gpu_out = []
+threads = []
+
+# generate random arrays and thread objects.
+for _ in range(num_arrays):
+    data.append(np.random.randn(array_len).astype('float32'))
+
+for k in range(num_arrays):
+    # create a thread that uses data we just generated
+    threads.append(KernelLauncherThread(data[k]))
+
+# launch threads to process arrays.
+for k in range(num_arrays):
+    threads[k].start()
+
+# get data from launched threads.
+for k in range(num_arrays):
+    gpu_out.append(threads[k].join())
+
+for k in range(num_arrays):
+    assert (np.allclose(gpu_out[k], data[k]))
+
diff --git a/Chapter05/multi-kernel_streams.py b/Chapter05/multi-kernel_streams.py
new file mode 100644
index 0000000..146c334
--- /dev/null
+++ b/Chapter05/multi-kernel_streams.py
@@ -0,0 +1,64 @@
+import pycuda.autoinit
+import pycuda.driver as drv
+from pycuda import gpuarray
+from pycuda.compiler import SourceModule
+import numpy as np
+from time import time
+
+num_arrays = 200
+array_len = 1024**2
+
+ker = SourceModule("""
+__global__ void mult_ker(float * array, int array_len)
+{
+    int thd = blockIdx.x*blockDim.x + threadIdx.x;
+    int num_iters = array_len / blockDim.x;
+
+    for(int j=0; j < num_iters; j++)
+    {
+        int i = j * blockDim.x + thd;
+
+        for(int k = 0; k < 50; k++)
+        {
+            array[i] *= 2.0;
+            array[i] /= 2.0;
+        }
+    }
+
+}
+""")
+
+mult_ker = ker.get_function('mult_ker')
+
+data = []
+data_gpu = []
+gpu_out = []
+streams = []
+
+for _ in range(num_arrays):
+    streams.append(drv.Stream())
+
+# generate random arrays.
+for _ in range(num_arrays):
+    data.append(np.random.randn(array_len).astype('float32'))
+
+t_start = time()
+
+# copy arrays to GPU.
+for k in range(num_arrays):
+    data_gpu.append(gpuarray.to_gpu_async(data[k], stream=streams[k]))
+
+# process arrays.
+for k in range(num_arrays):
+    mult_ker(data_gpu[k], np.int32(array_len), block=(64,1,1), grid=(1,1,1), stream=streams[k])
+
+# copy arrays from GPU.
+for k in range(num_arrays):
+    gpu_out.append(data_gpu[k].get_async(stream=streams[k]))
+
+t_end = time()
+
+for k in range(num_arrays):
+    assert (np.allclose(gpu_out[k], data[k]))
+
+print('Total time: %f' % (t_end - t_start))
diff --git a/Chapter05/simple_context_create.py b/Chapter05/simple_context_create.py
new file mode 100644
index 0000000..2b99316
--- /dev/null
+++ b/Chapter05/simple_context_create.py
@@ -0,0 +1,12 @@
+import numpy as np
+from pycuda import gpuarray
+import pycuda.driver as drv
+
+drv.init()
+dev = drv.Device(0)
+ctx = dev.make_context()
+
+x = gpuarray.to_gpu(np.float32([1,2,3]))
+print(x.get())
+
+ctx.pop()
diff --git a/Chapter05/simple_event_example.py b/Chapter05/simple_event_example.py
new file mode 100644
index 0000000..5060e9b
--- /dev/null
+++ b/Chapter05/simple_event_example.py
@@ -0,0 +1,47 @@
+import pycuda.autoinit
+import pycuda.driver as drv
+from pycuda import gpuarray
+from pycuda.compiler import SourceModule
+import numpy as np
+from time import time
+
+ker = SourceModule("""
+__global__ void mult_ker(float * array, int array_len)
+{
+    int thd = blockIdx.x*blockDim.x + threadIdx.x;
+    int num_iters = array_len / blockDim.x;
+
+    for(int j=0; j < num_iters; j++)
+    {
+        int i = j * blockDim.x + thd;
+
+        for(int k = 0; k < 50; k++)
+        {
+            array[i] *= 2.0;
+            array[i] /= 2.0;
+        }
+    }
+}
+""")
+
+mult_ker = ker.get_function('mult_ker')
+
+array_len = 100*1024**2
+
+data = np.random.randn(array_len).astype('float32')
+data_gpu = gpuarray.to_gpu(data)
+
+start_event = drv.Event()
+end_event = drv.Event()
+
+start_event.record()
+mult_ker(data_gpu, np.int32(array_len), block=(64,1,1), grid=(1,1,1))
+end_event.record()
+
+end_event.synchronize()
+
+print('Has the kernel started yet? {}'.format(start_event.query()))
+print('Has the kernel ended yet? {}'.format(end_event.query()))
+
+print('Kernel execution time in milliseconds: %f ' % start_event.time_till(end_event))
+
diff --git a/Chapter05/single_thread_example.py b/Chapter05/single_thread_example.py
new file mode 100644
index 0000000..976ba8d
--- /dev/null
+++ b/Chapter05/single_thread_example.py
@@ -0,0 +1,20 @@
+import threading
+
+class PointlessExampleThread(threading.Thread):
+    def __init__(self):
+        threading.Thread.__init__(self)
+        self.return_value = None
+
+    def run(self):
+        print('Hello from the thread you just spawned!')
+        self.return_value = 123
+
+    def join(self):
+        threading.Thread.join(self)
+        return self.return_value
+
+
+NewThread = PointlessExampleThread()
+NewThread.start()
+thread_output = NewThread.join()
+print('The thread completed and returned this value: %s' % thread_output)
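
A note on the stream examples above: to_gpu_async and get_async can only overlap transfers with kernel execution when the host buffers are page-locked; with ordinary numpy arrays the driver may fall back to effectively synchronous copies. Below is a minimal sketch, assuming PyCUDA's drv.pagelocked_empty helper, of how one per-stream buffer from multi-kernel_streams.py could be staged through pinned memory (the host_in and host_out names are illustrative and not part of the patch).

import numpy as np
import pycuda.autoinit
import pycuda.driver as drv
from pycuda import gpuarray

array_len = 1024**2
stream = drv.Stream()

# Page-locked (pinned) host buffer: needed for genuinely asynchronous copies.
host_in = drv.pagelocked_empty(array_len, dtype=np.float32)
host_in[:] = np.random.randn(array_len).astype(np.float32)

# Asynchronous host-to-device copy issued on the stream.
data_gpu = gpuarray.to_gpu_async(host_in, stream=stream)

# ... kernel launches on `stream` would go here, as in multi-kernel_streams.py ...

# Asynchronous device-to-host copy back into another pinned buffer.
host_out = drv.pagelocked_empty(array_len, dtype=np.float32)
data_gpu.get_async(stream=stream, ary=host_out)

# The host must wait for the stream before reading host_out.
stream.synchronize()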
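In multi-kernel_events.py, one rough way to confirm that the streams actually overlap is to compare the sum of the per-kernel durations with the span from the first recorded start event to the last recorded end event. A minimal sketch that could be appended after the existing timing code; it reuses the start_events, end_events, and kernel_times lists defined in that file, and the "concurrency factor" wording is just illustrative.

# Approximate overlap check, reusing the event lists from multi-kernel_events.py.
# If the streams run concurrently, the wall-clock span of all kernels should be
# much smaller than the sum of the individual kernel durations.
total_span_ms = start_events[0].time_till(end_events[-1])  # first start -> last end
summed_ms = sum(kernel_times)                              # serial-equivalent total
print('Span of all kernels (milliseconds): %f' % total_span_ms)
print('Sum of kernel durations (milliseconds): %f' % summed_ms)
print('Approximate concurrency factor: %f' % (summed_ms / total_span_ms))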