added chapter05 examples...

mp
2020-02-18 13:07:44 -08:00
parent bf8f8502cf
commit 34f4d14daa
9 changed files with 545 additions and 0 deletions


@@ -0,0 +1,104 @@
# CUDA Stream-based Concurrent Conway's game of life in Python / CUDA C
# written by Brian Tuomanen for "Hands on GPU Programming with Python and CUDA"
import pycuda.autoinit
import pycuda.driver as drv
from pycuda import gpuarray
from pycuda.compiler import SourceModule
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.animation as animation
ker = SourceModule("""
#define _X ( threadIdx.x + blockIdx.x * blockDim.x )
#define _Y ( threadIdx.y + blockIdx.y * blockDim.y )
#define _WIDTH ( blockDim.x * gridDim.x )
#define _HEIGHT ( blockDim.y * gridDim.y )
#define _XM(x) ( (x + _WIDTH) % _WIDTH )
#define _YM(y) ( (y + _HEIGHT) % _HEIGHT )
#define _INDEX(x,y) ( _XM(x) + _YM(y) * _WIDTH )
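// _XM and _YM wrap x and y around the lattice edges, so the grid behaves as a torus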
// return the number of living neighbors for a given cell
__device__ int nbrs(int x, int y, int * in)
{
    return ( in[_INDEX(x-1, y+1)] + in[_INDEX(x-1, y)] + in[_INDEX(x-1, y-1)] \
             + in[_INDEX(x, y+1)] + in[_INDEX(x, y-1)] \
             + in[_INDEX(x+1, y+1)] + in[_INDEX(x+1, y)] + in[_INDEX(x+1, y-1)] );
}

__global__ void conway_ker(int * lattice_out, int * lattice)
{
    // x, y are the appropriate values for the cell covered by this thread
    int x = _X, y = _Y;

    // count the number of neighbors around the current cell
    int n = nbrs(x, y, lattice);

    // if the current cell is alive, then determine if it lives or dies for the next generation.
    if ( lattice[_INDEX(x,y)] == 1)
        switch(n)
        {
            // if the cell is alive: it remains alive only if it has 2 or 3 neighbors.
            case 2:
            case 3: lattice_out[_INDEX(x,y)] = 1;
                    break;
            default: lattice_out[_INDEX(x,y)] = 0;
        }
    else if ( lattice[_INDEX(x,y)] == 0 )
        switch(n)
        {
            // a dead cell comes to life only if it has 3 neighbors that are alive.
            case 3: lattice_out[_INDEX(x,y)] = 1;
                    break;
            default: lattice_out[_INDEX(x,y)] = 0;
        }
}
""")
conway_ker = ker.get_function("conway_ker")
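# each lattice below gets its own CUDA stream, so kernel launches and copies for the different animation panels can be issued concurrently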
def update_gpu(frameNum, imgs, newLattices_gpu, lattices_gpu, N, streams, num_concurrent):
    for k in range(num_concurrent):
        conway_ker(newLattices_gpu[k], lattices_gpu[k], grid=(N//32, N//32, 1), block=(32, 32, 1), stream=streams[k])
        imgs[k].set_data(newLattices_gpu[k].get_async(stream=streams[k]))
        lattices_gpu[k].set_async(newLattices_gpu[k], stream=streams[k])
    return imgs
if __name__ == '__main__':
    # set lattice size
    N = 128
    num_concurrent = 4

    streams = []
    lattices_gpu = []
    newLattices_gpu = []

    for k in range(num_concurrent):
        streams.append(drv.Stream())
        lattice = np.int32(np.random.choice([1, 0], N*N, p=[0.25, 0.75]).reshape(N, N))
        lattices_gpu.append(gpuarray.to_gpu(lattice))
        newLattices_gpu.append(gpuarray.empty_like(lattices_gpu[k]))

    fig, ax = plt.subplots(nrows=1, ncols=num_concurrent)
    imgs = []

    for k in range(num_concurrent):
        imgs.append(ax[k].imshow(lattices_gpu[k].get_async(stream=streams[k]), interpolation='nearest'))

    ani = animation.FuncAnimation(fig, update_gpu, fargs=(imgs, newLattices_gpu, lattices_gpu, N, streams, num_concurrent), interval=0, frames=1000, save_count=1000)
    plt.show()


@@ -0,0 +1,79 @@
from time import time
import matplotlib
#this will prevent the figure from popping up
matplotlib.use('Agg')
from matplotlib import pyplot as plt
import numpy as np
import pycuda.autoinit
from pycuda import gpuarray
from pycuda.elementwise import ElementwiseKernel
mandel_ker = ElementwiseKernel(
"pycuda::complex<float> *lattice, float *mandelbrot_graph, int max_iters, float upper_bound",
"""
mandelbrot_graph[i] = 1;

pycuda::complex<float> c = lattice[i];
pycuda::complex<float> z(0,0);

for (int j = 0; j < max_iters; j++)
{
    z = z*z + c;

    if(abs(z) > upper_bound)
    {
        mandelbrot_graph[i] = 0;
        break;
    }
}
""",
"mandel_ker")
def gpu_mandelbrot(width, height, real_low, real_high, imag_low, imag_high, max_iters, upper_bound):
    # we set up our complex lattice as such
    real_vals = np.matrix(np.linspace(real_low, real_high, width), dtype=np.complex64)
    imag_vals = np.matrix(np.linspace(imag_high, imag_low, height), dtype=np.complex64) * 1j
    mandelbrot_lattice = np.array(real_vals + imag_vals.transpose(), dtype=np.complex64)

    # copy complex lattice to the GPU
    mandelbrot_lattice_gpu = gpuarray.to_gpu_async(mandelbrot_lattice)

    # synchronize in current context
    pycuda.autoinit.context.synchronize()

    # allocate an empty array on the GPU
    mandelbrot_graph_gpu = gpuarray.empty(shape=mandelbrot_lattice.shape, dtype=np.float32)

    mandel_ker(mandelbrot_lattice_gpu, mandelbrot_graph_gpu, np.int32(max_iters), np.float32(upper_bound))
    pycuda.autoinit.context.synchronize()

    mandelbrot_graph = mandelbrot_graph_gpu.get_async()
    pycuda.autoinit.context.synchronize()

    return mandelbrot_graph

if __name__ == '__main__':
    t1 = time()
    mandel = gpu_mandelbrot(512, 512, -2, 2, -2, 2, 256, 2)
    t2 = time()

    mandel_time = t2 - t1

    t1 = time()
    fig = plt.figure(1)
    plt.imshow(mandel, extent=(-2, 2, -2, 2))
    plt.savefig('mandelbrot.png', dpi=fig.dpi)
    t2 = time()

    dump_time = t2 - t1

    print('It took {} seconds to calculate the Mandelbrot graph.'.format(mandel_time))
    print('It took {} seconds to dump the image.'.format(dump_time))

Chapter05/multi-kernel.py

@@ -0,0 +1,60 @@
import pycuda.autoinit
import pycuda.driver as drv
from pycuda import gpuarray
from pycuda.compiler import SourceModule
import numpy as np
from time import time
num_arrays = 200
array_len = 1024**2
ker = SourceModule("""
__global__ void mult_ker(float * array, int array_len)
{
    int thd = blockIdx.x*blockDim.x + threadIdx.x;
    int num_iters = array_len / blockDim.x;

    for(int j=0; j < num_iters; j++)
    {
        int i = j * blockDim.x + thd;

        for(int k = 0; k < 50; k++)
        {
            array[i] *= 2.0;
            array[i] /= 2.0;
        }
    }
}
""")
mult_ker = ker.get_function('mult_ker')
data = []
data_gpu = []
gpu_out = []
# generate random arrays.
for _ in range(num_arrays):
    data.append(np.random.randn(array_len).astype('float32'))

t_start = time()

# copy arrays to GPU.
for k in range(num_arrays):
    data_gpu.append(gpuarray.to_gpu(data[k]))

# process arrays.
for k in range(num_arrays):
    mult_ker(data_gpu[k], np.int32(array_len), block=(64,1,1), grid=(1,1,1))

# copy arrays from GPU.
for k in range(num_arrays):
    gpu_out.append(data_gpu[k].get())

t_end = time()

for k in range(num_arrays):
    assert (np.allclose(gpu_out[k], data[k]))

print('Total time: %f' % (t_end - t_start))


@@ -0,0 +1,75 @@
import pycuda.autoinit
import pycuda.driver as drv
from pycuda import gpuarray
from pycuda.compiler import SourceModule
import numpy as np
from time import time
num_arrays = 200
array_len = 1024**2
ker = SourceModule("""
__global__ void mult_ker(float * array, int array_len)
{
    int thd = blockIdx.x*blockDim.x + threadIdx.x;
    int num_iters = array_len / blockDim.x;

    for(int j=0; j < num_iters; j++)
    {
        int i = j * blockDim.x + thd;

        for(int k = 0; k < 50; k++)
        {
            array[i] *= 2.0;
            array[i] /= 2.0;
        }
    }
}
""")
mult_ker = ker.get_function('mult_ker')
data = []
data_gpu = []
gpu_out = []
streams = []
start_events = []
end_events = []
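# one stream plus a start/end event pair per array; the events are recorded into their stream so each kernel's GPU-side duration can be read back later with time_till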
for _ in range(num_arrays):
    streams.append(drv.Stream())
    start_events.append(drv.Event())
    end_events.append(drv.Event())

# generate random arrays.
for _ in range(num_arrays):
    data.append(np.random.randn(array_len).astype('float32'))

t_start = time()

# copy arrays to GPU.
for k in range(num_arrays):
    data_gpu.append(gpuarray.to_gpu_async(data[k], stream=streams[k]))

# process arrays.
for k in range(num_arrays):
    start_events[k].record(streams[k])
    mult_ker(data_gpu[k], np.int32(array_len), block=(64,1,1), grid=(1,1,1), stream=streams[k])
for k in range(num_arrays):
    end_events[k].record(streams[k])

# copy arrays from GPU.
for k in range(num_arrays):
    gpu_out.append(data_gpu[k].get_async(stream=streams[k]))

t_end = time()

for k in range(num_arrays):
    assert (np.allclose(gpu_out[k], data[k]))

kernel_times = []
for k in range(num_arrays):
    kernel_times.append(start_events[k].time_till(end_events[k]))
print('Total time: %f' % (t_end - t_start))
print('Mean kernel duration (milliseconds): %f' % np.mean(kernel_times))
print('Mean kernel standard deviation (milliseconds): %f' % np.std(kernel_times))


@@ -0,0 +1,84 @@
import pycuda
import pycuda.driver as drv
from pycuda import gpuarray
from pycuda.compiler import SourceModule
import numpy as np
from time import time
import threading
num_arrays = 10
array_len = 1024**2
kernel_code = """
__global__ void mult_ker(float * array, int array_len)
{
    int thd = blockIdx.x*blockDim.x + threadIdx.x;
    int num_iters = array_len / blockDim.x;

    for(int j=0; j < num_iters; j++)
    {
        int i = j * blockDim.x + thd;

        for(int k = 0; k < 50; k++)
        {
            array[i] *= 2.0;
            array[i] /= 2.0;
        }
    }
}
"""
class KernelLauncherThread(threading.Thread):
    def __init__(self, input_array):
        threading.Thread.__init__(self)
        self.input_array = input_array
        self.output_array = None

    def run(self):
        self.dev = drv.Device(0)
        self.context = self.dev.make_context()

        self.ker = SourceModule(kernel_code)
        self.mult_ker = self.ker.get_function('mult_ker')

        self.array_gpu = gpuarray.to_gpu(self.input_array)
        self.mult_ker(self.array_gpu, np.int32(array_len), block=(64,1,1), grid=(1,1,1))
        self.output_array = self.array_gpu.get()

        self.context.pop()

    def join(self):
        threading.Thread.join(self)
        return self.output_array
drv.init()
data = []
gpu_out = []
threads = []
# generate random arrays and thread objects.
for _ in range(num_arrays):
    data.append(np.random.randn(array_len).astype('float32'))

for k in range(num_arrays):
    # create a thread that uses data we just generated
    threads.append(KernelLauncherThread(data[k]))

# launch threads to process arrays.
for k in range(num_arrays):
    threads[k].start()

# get data from launched threads.
for k in range(num_arrays):
    gpu_out.append(threads[k].join())

for k in range(num_arrays):
    assert (np.allclose(gpu_out[k], data[k]))


@@ -0,0 +1,64 @@
import pycuda.autoinit
import pycuda.driver as drv
from pycuda import gpuarray
from pycuda.compiler import SourceModule
import numpy as np
from time import time
num_arrays = 200
array_len = 1024**2
ker = SourceModule("""
__global__ void mult_ker(float * array, int array_len)
{
    int thd = blockIdx.x*blockDim.x + threadIdx.x;
    int num_iters = array_len / blockDim.x;

    for(int j=0; j < num_iters; j++)
    {
        int i = j * blockDim.x + thd;

        for(int k = 0; k < 50; k++)
        {
            array[i] *= 2.0;
            array[i] /= 2.0;
        }
    }
}
""")
mult_ker = ker.get_function('mult_ker')
data = []
data_gpu = []
gpu_out = []
streams = []
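# one CUDA stream per array: copies and kernel launches issued on different streams are free to overlap on the GPU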
for _ in range(num_arrays):
    streams.append(drv.Stream())

# generate random arrays.
for _ in range(num_arrays):
    data.append(np.random.randn(array_len).astype('float32'))

t_start = time()

# copy arrays to GPU.
for k in range(num_arrays):
    data_gpu.append(gpuarray.to_gpu_async(data[k], stream=streams[k]))

# process arrays.
for k in range(num_arrays):
    mult_ker(data_gpu[k], np.int32(array_len), block=(64,1,1), grid=(1,1,1), stream=streams[k])

# copy arrays from GPU.
for k in range(num_arrays):
    gpu_out.append(data_gpu[k].get_async(stream=streams[k]))

t_end = time()

for k in range(num_arrays):
    assert (np.allclose(gpu_out[k], data[k]))

print('Total time: %f' % (t_end - t_start))


@@ -0,0 +1,12 @@
import numpy as np
from pycuda import gpuarray
import pycuda.driver as drv
drv.init()
dev = drv.Device(0)
ctx = dev.make_context()
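# make_context() creates a fresh context on the device and pushes it onto this thread's context stack, making it current until ctx.pop() is called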
x = gpuarray.to_gpu(np.float32([1,2,3]))
print(x.get())
ctx.pop()


@@ -0,0 +1,47 @@
import pycuda.autoinit
import pycuda.driver as drv
from pycuda import gpuarray
from pycuda.compiler import SourceModule
import numpy as np
from time import time
ker = SourceModule("""
__global__ void mult_ker(float * array, int array_len)
{
    int thd = blockIdx.x*blockDim.x + threadIdx.x;
    int num_iters = array_len / blockDim.x;

    for(int j=0; j < num_iters; j++)
    {
        int i = j * blockDim.x + thd;

        for(int k = 0; k < 50; k++)
        {
            array[i] *= 2.0;
            array[i] /= 2.0;
        }
    }
}
""")
mult_ker = ker.get_function('mult_ker')
array_len = 100*1024**2
data = np.random.randn(array_len).astype('float32')
data_gpu = gpuarray.to_gpu(data)
start_event = drv.Event()
end_event = drv.Event()
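# record() enqueues each event in the default stream; query() reports without blocking whether the event has been reached, while synchronize() blocks the host until it has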
start_event.record()
mult_ker(data_gpu, np.int32(array_len), block=(64,1,1), grid=(1,1,1))
end_event.record()
end_event.synchronize()
print('Has the kernel started yet? {}'.format(start_event.query()))
print('Has the kernel ended yet? {}'.format(end_event.query()))
print('Kernel execution time in milliseconds: %f ' % start_event.time_till(end_event))


@@ -0,0 +1,20 @@
import threading
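# subclassing Thread and overriding join() lets the caller get the thread's result back when it waits for it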
class PointlessExampleThread(threading.Thread):
    def __init__(self):
        threading.Thread.__init__(self)
        self.return_value = None

    def run(self):
        print('Hello from the thread you just spawned!')
        self.return_value = 123

    def join(self):
        threading.Thread.join(self)
        return self.return_value
NewThread = PointlessExampleThread()
NewThread.start()
thread_output = NewThread.join()
print('The thread completed and returned this value: %s' % thread_output)