added chapter05 examples...

mp
2020-02-18 13:07:44 -08:00
parent bf8f8502cf
commit 34f4d14daa
9 changed files with 545 additions and 0 deletions


@@ -0,0 +1,104 @@
# CUDA Stream-based Concurrent Conway's game of life in Python / CUDA C
# written by Brian Tuomanen for "Hands on GPU Programming with Python and CUDA"
import pycuda.autoinit
import pycuda.driver as drv
from pycuda import gpuarray
from pycuda.compiler import SourceModule
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.animation as animation
ker = SourceModule("""
#define _X ( threadIdx.x + blockIdx.x * blockDim.x )
#define _Y ( threadIdx.y + blockIdx.y * blockDim.y )
#define _WIDTH ( blockDim.x * gridDim.x )
#define _HEIGHT ( blockDim.y * gridDim.y )
#define _XM(x) ( (x + _WIDTH) % _WIDTH )
#define _YM(y) ( (y + _HEIGHT) % _HEIGHT )
#define _INDEX(x,y) ( _XM(x) + _YM(y) * _WIDTH )
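// _XM and _YM wrap x and y around the lattice edges, so the grid behaves as a torus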
// return the number of living neighbors for a given cell
__device__ int nbrs(int x, int y, int * in)
{
    return ( in[_INDEX(x-1, y+1)] + in[_INDEX(x-1, y)] + in[_INDEX(x-1, y-1)] \
             + in[_INDEX(x, y+1)] + in[_INDEX(x, y-1)] \
             + in[_INDEX(x+1, y+1)] + in[_INDEX(x+1, y)] + in[_INDEX(x+1, y-1)] );
}

__global__ void conway_ker(int * lattice_out, int * lattice)
{
    // x, y are the appropriate values for the cell covered by this thread
    int x = _X, y = _Y;

    // count the number of neighbors around the current cell
    int n = nbrs(x, y, lattice);

    // if the current cell is alive, then determine if it lives or dies for the next generation.
    if ( lattice[_INDEX(x,y)] == 1)
        switch(n)
        {
            // if the cell is alive: it remains alive only if it has 2 or 3 neighbors.
            case 2:
            case 3: lattice_out[_INDEX(x,y)] = 1;
                    break;
            default: lattice_out[_INDEX(x,y)] = 0;
        }
    else if ( lattice[_INDEX(x,y)] == 0 )
        switch(n)
        {
            // a dead cell comes to life only if it has 3 neighbors that are alive.
            case 3: lattice_out[_INDEX(x,y)] = 1;
                    break;
            default: lattice_out[_INDEX(x,y)] = 0;
        }
}
""")
conway_ker = ker.get_function("conway_ker")
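# each lattice below gets its own CUDA stream, so kernel launches and copies for the different animation panels can be issued concurrently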
def update_gpu(frameNum, imgs, newLattices_gpu, lattices_gpu, N, streams, num_concurrent):
    for k in range(num_concurrent):
        conway_ker(newLattices_gpu[k], lattices_gpu[k], grid=(N//32, N//32, 1), block=(32, 32, 1), stream=streams[k])
        imgs[k].set_data(newLattices_gpu[k].get_async(stream=streams[k]))
        lattices_gpu[k].set_async(newLattices_gpu[k], stream=streams[k])
    return imgs
if __name__ == '__main__':
    # set lattice size
    N = 128
    num_concurrent = 4

    streams = []
    lattices_gpu = []
    newLattices_gpu = []

    for k in range(num_concurrent):
        streams.append(drv.Stream())
        lattice = np.int32(np.random.choice([1, 0], N*N, p=[0.25, 0.75]).reshape(N, N))
        lattices_gpu.append(gpuarray.to_gpu(lattice))
        newLattices_gpu.append(gpuarray.empty_like(lattices_gpu[k]))

    fig, ax = plt.subplots(nrows=1, ncols=num_concurrent)
    imgs = []

    for k in range(num_concurrent):
        imgs.append(ax[k].imshow(lattices_gpu[k].get_async(stream=streams[k]), interpolation='nearest'))

    ani = animation.FuncAnimation(fig, update_gpu, fargs=(imgs, newLattices_gpu, lattices_gpu, N, streams, num_concurrent), interval=0, frames=1000, save_count=1000)
    plt.show()


@@ -0,0 +1,79 @@
from time import time
import matplotlib
#this will prevent the figure from popping up
matplotlib.use('Agg')
from matplotlib import pyplot as plt
import numpy as np
import pycuda.autoinit
from pycuda import gpuarray
from pycuda.elementwise import ElementwiseKernel
mandel_ker = ElementwiseKernel(
"pycuda::complex<float> *lattice, float *mandelbrot_graph, int max_iters, float upper_bound",
"""
mandelbrot_graph[i] = 1;

pycuda::complex<float> c = lattice[i];
pycuda::complex<float> z(0,0);

for (int j = 0; j < max_iters; j++)
{
    z = z*z + c;

    if(abs(z) > upper_bound)
    {
        mandelbrot_graph[i] = 0;
        break;
    }
}
""",
"mandel_ker")
def gpu_mandelbrot(width, height, real_low, real_high, imag_low, imag_high, max_iters, upper_bound):
    # we set up our complex lattice as such
    real_vals = np.matrix(np.linspace(real_low, real_high, width), dtype=np.complex64)
    imag_vals = np.matrix(np.linspace(imag_high, imag_low, height), dtype=np.complex64) * 1j
    mandelbrot_lattice = np.array(real_vals + imag_vals.transpose(), dtype=np.complex64)

    # copy complex lattice to the GPU
    mandelbrot_lattice_gpu = gpuarray.to_gpu_async(mandelbrot_lattice)

    # synchronize in current context
    pycuda.autoinit.context.synchronize()

    # allocate an empty array on the GPU
    mandelbrot_graph_gpu = gpuarray.empty(shape=mandelbrot_lattice.shape, dtype=np.float32)

    mandel_ker(mandelbrot_lattice_gpu, mandelbrot_graph_gpu, np.int32(max_iters), np.float32(upper_bound))
    pycuda.autoinit.context.synchronize()

    mandelbrot_graph = mandelbrot_graph_gpu.get_async()
    pycuda.autoinit.context.synchronize()

    return mandelbrot_graph

if __name__ == '__main__':
    t1 = time()
    mandel = gpu_mandelbrot(512, 512, -2, 2, -2, 2, 256, 2)
    t2 = time()

    mandel_time = t2 - t1

    t1 = time()
    fig = plt.figure(1)
    plt.imshow(mandel, extent=(-2, 2, -2, 2))
    plt.savefig('mandelbrot.png', dpi=fig.dpi)
    t2 = time()

    dump_time = t2 - t1

    print('It took {} seconds to calculate the Mandelbrot graph.'.format(mandel_time))
    print('It took {} seconds to dump the image.'.format(dump_time))

Chapter05/multi-kernel.py

@@ -0,0 +1,60 @@
import pycuda.autoinit
import pycuda.driver as drv
from pycuda import gpuarray
from pycuda.compiler import SourceModule
import numpy as np
from time import time
num_arrays = 200
array_len = 1024**2
ker = SourceModule("""
__global__ void mult_ker(float * array, int array_len)
{
    int thd = blockIdx.x*blockDim.x + threadIdx.x;
    int num_iters = array_len / blockDim.x;

    for(int j=0; j < num_iters; j++)
    {
        int i = j * blockDim.x + thd;

        for(int k = 0; k < 50; k++)
        {
            array[i] *= 2.0;
            array[i] /= 2.0;
        }
    }
}
""")
mult_ker = ker.get_function('mult_ker')
data = []
data_gpu = []
gpu_out = []
# generate random arrays.
for _ in range(num_arrays):
    data.append(np.random.randn(array_len).astype('float32'))

t_start = time()

# copy arrays to GPU.
for k in range(num_arrays):
    data_gpu.append(gpuarray.to_gpu(data[k]))

# process arrays.
for k in range(num_arrays):
    mult_ker(data_gpu[k], np.int32(array_len), block=(64,1,1), grid=(1,1,1))

# copy arrays from GPU.
for k in range(num_arrays):
    gpu_out.append(data_gpu[k].get())

t_end = time()

for k in range(num_arrays):
    assert (np.allclose(gpu_out[k], data[k]))

print('Total time: %f' % (t_end - t_start))


@@ -0,0 +1,75 @@
import pycuda.autoinit
import pycuda.driver as drv
from pycuda import gpuarray
from pycuda.compiler import SourceModule
import numpy as np
from time import time
num_arrays = 200
array_len = 1024**2
ker = SourceModule("""
__global__ void mult_ker(float * array, int array_len)
{
    int thd = blockIdx.x*blockDim.x + threadIdx.x;
    int num_iters = array_len / blockDim.x;

    for(int j=0; j < num_iters; j++)
    {
        int i = j * blockDim.x + thd;

        for(int k = 0; k < 50; k++)
        {
            array[i] *= 2.0;
            array[i] /= 2.0;
        }
    }
}
""")
mult_ker = ker.get_function('mult_ker')
data = []
data_gpu = []
gpu_out = []
streams = []
start_events = []
end_events = []
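# one stream plus a start/end event pair per array; the events are recorded into their stream so each kernel's GPU-side duration can be read back later with time_till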
for _ in range(num_arrays):
    streams.append(drv.Stream())
    start_events.append(drv.Event())
    end_events.append(drv.Event())

# generate random arrays.
for _ in range(num_arrays):
    data.append(np.random.randn(array_len).astype('float32'))

t_start = time()

# copy arrays to GPU.
for k in range(num_arrays):
    data_gpu.append(gpuarray.to_gpu_async(data[k], stream=streams[k]))

# process arrays.
for k in range(num_arrays):
    start_events[k].record(streams[k])
    mult_ker(data_gpu[k], np.int32(array_len), block=(64,1,1), grid=(1,1,1), stream=streams[k])
for k in range(num_arrays):
    end_events[k].record(streams[k])

# copy arrays from GPU.
for k in range(num_arrays):
    gpu_out.append(data_gpu[k].get_async(stream=streams[k]))

t_end = time()

for k in range(num_arrays):
    assert (np.allclose(gpu_out[k], data[k]))

kernel_times = []
for k in range(num_arrays):
    kernel_times.append(start_events[k].time_till(end_events[k]))
print('Total time: %f' % (t_end - t_start))
print('Mean kernel duration (milliseconds): %f' % np.mean(kernel_times))
print('Mean kernel standard deviation (milliseconds): %f' % np.std(kernel_times))


@@ -0,0 +1,84 @@
import pycuda
import pycuda.driver as drv
from pycuda import gpuarray
from pycuda.compiler import SourceModule
import numpy as np
from time import time
import threading
num_arrays = 10
array_len = 1024**2
kernel_code = """
__global__ void mult_ker(float * array, int array_len)
{
    int thd = blockIdx.x*blockDim.x + threadIdx.x;
    int num_iters = array_len / blockDim.x;

    for(int j=0; j < num_iters; j++)
    {
        int i = j * blockDim.x + thd;

        for(int k = 0; k < 50; k++)
        {
            array[i] *= 2.0;
            array[i] /= 2.0;
        }
    }
}
"""
class KernelLauncherThread(threading.Thread):
    def __init__(self, input_array):
        threading.Thread.__init__(self)
        self.input_array = input_array
        self.output_array = None

    def run(self):
        self.dev = drv.Device(0)
        self.context = self.dev.make_context()

        self.ker = SourceModule(kernel_code)
        self.mult_ker = self.ker.get_function('mult_ker')

        self.array_gpu = gpuarray.to_gpu(self.input_array)
        self.mult_ker(self.array_gpu, np.int32(array_len), block=(64,1,1), grid=(1,1,1))
        self.output_array = self.array_gpu.get()

        self.context.pop()

    def join(self):
        threading.Thread.join(self)
        return self.output_array
drv.init()
data = []
gpu_out = []
threads = []
# generate random arrays and thread objects.
for _ in range(num_arrays):
    data.append(np.random.randn(array_len).astype('float32'))

for k in range(num_arrays):
    # create a thread that uses data we just generated
    threads.append(KernelLauncherThread(data[k]))

# launch threads to process arrays.
for k in range(num_arrays):
    threads[k].start()

# get data from launched threads.
for k in range(num_arrays):
    gpu_out.append(threads[k].join())

for k in range(num_arrays):
    assert (np.allclose(gpu_out[k], data[k]))


@@ -0,0 +1,64 @@
import pycuda.autoinit
import pycuda.driver as drv
from pycuda import gpuarray
from pycuda.compiler import SourceModule
import numpy as np
from time import time
num_arrays = 200
array_len = 1024**2
ker = SourceModule("""
__global__ void mult_ker(float * array, int array_len)
{
    int thd = blockIdx.x*blockDim.x + threadIdx.x;
    int num_iters = array_len / blockDim.x;

    for(int j=0; j < num_iters; j++)
    {
        int i = j * blockDim.x + thd;

        for(int k = 0; k < 50; k++)
        {
            array[i] *= 2.0;
            array[i] /= 2.0;
        }
    }
}
""")
mult_ker = ker.get_function('mult_ker')
data = []
data_gpu = []
gpu_out = []
streams = []
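# one CUDA stream per array: copies and kernel launches issued on different streams are free to overlap on the GPU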
for _ in range(num_arrays):
    streams.append(drv.Stream())

# generate random arrays.
for _ in range(num_arrays):
    data.append(np.random.randn(array_len).astype('float32'))

t_start = time()

# copy arrays to GPU.
for k in range(num_arrays):
    data_gpu.append(gpuarray.to_gpu_async(data[k], stream=streams[k]))

# process arrays.
for k in range(num_arrays):
    mult_ker(data_gpu[k], np.int32(array_len), block=(64,1,1), grid=(1,1,1), stream=streams[k])

# copy arrays from GPU.
for k in range(num_arrays):
    gpu_out.append(data_gpu[k].get_async(stream=streams[k]))

t_end = time()

for k in range(num_arrays):
    assert (np.allclose(gpu_out[k], data[k]))

print('Total time: %f' % (t_end - t_start))


@@ -0,0 +1,12 @@
import numpy as np
from pycuda import gpuarray
import pycuda.driver as drv
drv.init()
dev = drv.Device(0)
ctx = dev.make_context()
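# make_context() creates a fresh context on the device and pushes it onto this thread's context stack, making it current until ctx.pop() is called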
x = gpuarray.to_gpu(np.float32([1,2,3]))
print(x.get())
ctx.pop()


@@ -0,0 +1,47 @@
import pycuda.autoinit
import pycuda.driver as drv
from pycuda import gpuarray
from pycuda.compiler import SourceModule
import numpy as np
from time import time
ker = SourceModule("""
__global__ void mult_ker(float * array, int array_len)
{
    int thd = blockIdx.x*blockDim.x + threadIdx.x;
    int num_iters = array_len / blockDim.x;

    for(int j=0; j < num_iters; j++)
    {
        int i = j * blockDim.x + thd;

        for(int k = 0; k < 50; k++)
        {
            array[i] *= 2.0;
            array[i] /= 2.0;
        }
    }
}
""")
mult_ker = ker.get_function('mult_ker')
array_len = 100*1024**2
data = np.random.randn(array_len).astype('float32')
data_gpu = gpuarray.to_gpu(data)
start_event = drv.Event()
end_event = drv.Event()
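# record() enqueues each event in the default stream; query() reports without blocking whether the event has been reached, while synchronize() blocks the host until it has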
start_event.record()
mult_ker(data_gpu, np.int32(array_len), block=(64,1,1), grid=(1,1,1))
end_event.record()
end_event.synchronize()
print('Has the kernel started yet? {}'.format(start_event.query()))
print('Has the kernel ended yet? {}'.format(end_event.query()))
print('Kernel execution time in milliseconds: %f ' % start_event.time_till(end_event))


@@ -0,0 +1,20 @@
import threading
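# subclassing Thread and overriding join() lets the caller get the thread's result back when it waits for it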
class PointlessExampleThread(threading.Thread):
    def __init__(self):
        threading.Thread.__init__(self)
        self.return_value = None

    def run(self):
        print('Hello from the thread you just spawned!')
        self.return_value = 123

    def join(self):
        threading.Thread.join(self)
        return self.return_value
NewThread = PointlessExampleThread()
NewThread.start()
thread_output = NewThread.join()
print('The thread completed and returned this value: %s' % thread_output)