From 34f4d14daa0f8d21fb45baa7a6b0a587045a78e2 Mon Sep 17 00:00:00 2001
From: mp
Date: Tue, 18 Feb 2020 13:07:44 -0800
Subject: [PATCH] added chapter05 examples...

---
 Chapter05/conway_gpu_streams.py          | 104 +++++++++++++++++++++++
 Chapter05/gpu_mandelbrot_context_sync.py |  79 +++++++++++++++++
 Chapter05/multi-kernel.py                |  60 +++++++++++++
 Chapter05/multi-kernel_events.py         |  75 ++++++++++++++++
 Chapter05/multi-kernel_multi-thread.py   |  84 ++++++++++++++++++
 Chapter05/multi-kernel_streams.py        |  64 ++++++++++++++
 Chapter05/simple_context_create.py       |  12 +++
 Chapter05/simple_event_example.py        |  47 ++++++++++
 Chapter05/single_thread_example.py       |  20 +++++
 9 files changed, 545 insertions(+)
 create mode 100644 Chapter05/conway_gpu_streams.py
 create mode 100644 Chapter05/gpu_mandelbrot_context_sync.py
 create mode 100644 Chapter05/multi-kernel.py
 create mode 100644 Chapter05/multi-kernel_events.py
 create mode 100644 Chapter05/multi-kernel_multi-thread.py
 create mode 100644 Chapter05/multi-kernel_streams.py
 create mode 100644 Chapter05/simple_context_create.py
 create mode 100644 Chapter05/simple_event_example.py
 create mode 100644 Chapter05/single_thread_example.py

diff --git a/Chapter05/conway_gpu_streams.py b/Chapter05/conway_gpu_streams.py
new file mode 100644
index 0000000..0e16925
--- /dev/null
+++ b/Chapter05/conway_gpu_streams.py
@@ -0,0 +1,104 @@
+# CUDA Stream-based Concurrent Conway's game of life in Python / CUDA C
+# written by Brian Tuomanen for "Hands on GPU Programming with Python and CUDA"
+
+import pycuda.autoinit
+import pycuda.driver as drv
+from pycuda import gpuarray
+from pycuda.compiler import SourceModule
+import numpy as np
+import matplotlib.pyplot as plt
+import matplotlib.animation as animation
+
+ker = SourceModule("""
+#define _X ( threadIdx.x + blockIdx.x * blockDim.x )
+#define _Y ( threadIdx.y + blockIdx.y * blockDim.y )
+
+#define _WIDTH  ( blockDim.x * gridDim.x )
+#define _HEIGHT ( blockDim.y * gridDim.y )
+
+#define _XM(x) ( (x + _WIDTH) % _WIDTH )
+#define _YM(y) ( (y + _HEIGHT) % _HEIGHT )
+
+#define _INDEX(x,y) ( _XM(x) + _YM(y) * _WIDTH )
+
+// return the number of living neighbors for a given cell
+__device__ int nbrs(int x, int y, int * in)
+{
+    return ( in[ _INDEX(x -1, y+1) ] + in[ _INDEX(x-1, y) ] + in[ _INDEX(x-1, y-1) ] \
+             + in[ _INDEX(x, y+1)] + in[_INDEX(x, y - 1)] \
+             + in[ _INDEX(x+1, y+1) ] + in[ _INDEX(x+1, y) ] + in[ _INDEX(x+1, y-1) ] );
+}
+
+__global__ void conway_ker(int * lattice_out, int * lattice )
+{
+    // x, y are the appropriate values for the cell covered by this thread
+    int x = _X, y = _Y;
+
+    // count the number of neighbors around the current cell
+    int n = nbrs(x, y, lattice);
+
+    // if the current cell is alive, then determine if it lives or dies for the next generation.
+    if ( lattice[_INDEX(x,y)] == 1)
+        switch(n)
+        {
+            // if the cell is alive: it remains alive only if it has 2 or 3 neighbors.
+            case 2:
+            case 3: lattice_out[_INDEX(x,y)] = 1;
+                    break;
+            default: lattice_out[_INDEX(x,y)] = 0;
+        }
+    else if( lattice[_INDEX(x,y)] == 0 )
+        switch(n)
+        {
+            // a dead cell comes to life only if it has 3 neighbors that are alive.
+            case 3: lattice_out[_INDEX(x,y)] = 1;
+                    break;
+            default: lattice_out[_INDEX(x,y)] = 0;
+        }
+
+}
+""")
+
+
+conway_ker = ker.get_function("conway_ker")
+
+
+def update_gpu(frameNum, imgs, newLattices_gpu, lattices_gpu, N, streams, num_concurrent):
+
+    for k in range(num_concurrent):
+        conway_ker( newLattices_gpu[k], lattices_gpu[k], grid=(N//32,N//32,1), block=(32,32,1), stream=streams[k] )
+
+        imgs[k].set_data(newLattices_gpu[k].get_async(stream=streams[k]) )
+
+        lattices_gpu[k].set_async(newLattices_gpu[k], stream=streams[k])
+
+    return imgs
+
+
+if __name__ == '__main__':
+    # set lattice size
+    N = 128
+
+    num_concurrent = 4
+
+    streams = []
+    lattices_gpu = []
+    newLattices_gpu = []
+
+    for k in range(num_concurrent):
+        streams.append(drv.Stream())
+        lattice = np.int32( np.random.choice([1,0], N*N, p=[0.25, 0.75]).reshape(N, N) )
+        lattices_gpu.append(gpuarray.to_gpu(lattice))
+        newLattices_gpu.append(gpuarray.empty_like(lattices_gpu[k]))
+
+    fig, ax = plt.subplots(nrows=1, ncols=num_concurrent)
+    imgs = []
+
+    for k in range(num_concurrent):
+        imgs.append( ax[k].imshow(lattices_gpu[k].get_async(stream=streams[k]), interpolation='nearest') )
+
+    ani = animation.FuncAnimation(fig, update_gpu, fargs=(imgs, newLattices_gpu, lattices_gpu, N, streams, num_concurrent), interval=0, frames=1000, save_count=1000)
+
+    plt.show()
diff --git a/Chapter05/gpu_mandelbrot_context_sync.py b/Chapter05/gpu_mandelbrot_context_sync.py
new file mode 100644
index 0000000..d1688c0
--- /dev/null
+++ b/Chapter05/gpu_mandelbrot_context_sync.py
@@ -0,0 +1,79 @@
+from time import time
+import matplotlib
+#this will prevent the figure from popping up
+matplotlib.use('Agg')
+from matplotlib import pyplot as plt
+import numpy as np
+import pycuda.autoinit
+from pycuda import gpuarray
+from pycuda.elementwise import ElementwiseKernel
+
+mandel_ker = ElementwiseKernel(
+"pycuda::complex<float> *lattice, float *mandelbrot_graph, int max_iters, float upper_bound",
+"""
+mandelbrot_graph[i] = 1;
+
+pycuda::complex<float> c = lattice[i];
+pycuda::complex<float> z(0,0);
+
+for (int j = 0; j < max_iters; j++)
+    {
+
+    z = z*z + c;
+
+    if(abs(z) > upper_bound)
+        {
+        mandelbrot_graph[i] = 0;
+        break;
+        }
+
+    }
+
+""",
+"mandel_ker")
+
+def gpu_mandelbrot(width, height, real_low, real_high, imag_low, imag_high, max_iters, upper_bound):
+
+    # we set up our complex lattice as such
+    real_vals = np.matrix(np.linspace(real_low, real_high, width), dtype=np.complex64)
+    imag_vals = np.matrix(np.linspace( imag_high, imag_low, height), dtype=np.complex64) * 1j
+    mandelbrot_lattice = np.array(real_vals + imag_vals.transpose(), dtype=np.complex64)
+
+    # copy complex lattice to the GPU
+    mandelbrot_lattice_gpu = gpuarray.to_gpu_async(mandelbrot_lattice)
+
+    # synchronize in current context
+    pycuda.autoinit.context.synchronize()
+
+    # allocate an empty array on the GPU
+    mandelbrot_graph_gpu = gpuarray.empty(shape=mandelbrot_lattice.shape, dtype=np.float32)
+
+    mandel_ker( mandelbrot_lattice_gpu, mandelbrot_graph_gpu, np.int32(max_iters), np.float32(upper_bound))
+
+    pycuda.autoinit.context.synchronize()
+
+    mandelbrot_graph = mandelbrot_graph_gpu.get_async()
+
+    pycuda.autoinit.context.synchronize()
+
+    return mandelbrot_graph
+
+
+if __name__ == '__main__':
+
+    t1 = time()
+    mandel = gpu_mandelbrot(512,512,-2,2,-2,2,256, 2)
+    t2 = time()
+
+    mandel_time = t2 - t1
+
+    t1 = time()
+    fig = plt.figure(1)
+    plt.imshow(mandel, extent=(-2, 2, -2, 2))
+    plt.savefig('mandelbrot.png', dpi=fig.dpi)
+    t2 = time()
+
+    dump_time = t2 - t1
+
+    print('It took {} seconds to calculate the Mandelbrot graph.'.format(mandel_time))
+    print('It took {} seconds to dump the image.'.format(dump_time))
diff --git a/Chapter05/multi-kernel.py b/Chapter05/multi-kernel.py
new file mode 100644
index 0000000..e3dfc98
--- /dev/null
+++ b/Chapter05/multi-kernel.py
@@ -0,0 +1,60 @@
+import pycuda.autoinit
+import pycuda.driver as drv
+from pycuda import gpuarray
+from pycuda.compiler import SourceModule
+import numpy as np
+from time import time
+
+num_arrays = 200
+array_len = 1024**2
+
+ker = SourceModule("""
+__global__ void mult_ker(float * array, int array_len)
+{
+    int thd = blockIdx.x*blockDim.x + threadIdx.x;
+    int num_iters = array_len / blockDim.x;
+
+    for(int j=0; j < num_iters; j++)
+    {
+        int i = j * blockDim.x + thd;
+
+        for(int k = 0; k < 50; k++)
+        {
+            array[i] *= 2.0;
+            array[i] /= 2.0;
+        }
+    }
+
+}
+""")
+
+mult_ker = ker.get_function('mult_ker')
+
+data = []
+data_gpu = []
+gpu_out = []
+
+# generate random arrays.
+for _ in range(num_arrays):
+    data.append(np.random.randn(array_len).astype('float32'))
+
+t_start = time()
+
+# copy arrays to GPU.
+for k in range(num_arrays):
+    data_gpu.append(gpuarray.to_gpu(data[k]))
+
+# process arrays.
+for k in range(num_arrays):
+    mult_ker(data_gpu[k], np.int32(array_len), block=(64,1,1), grid=(1,1,1))
+
+# copy arrays from GPU.
+for k in range(num_arrays):
+    gpu_out.append(data_gpu[k].get())
+
+t_end = time()
+
+for k in range(num_arrays):
+    assert (np.allclose(gpu_out[k], data[k]))
+
+print('Total time: %f' % (t_end - t_start))
diff --git a/Chapter05/multi-kernel_events.py b/Chapter05/multi-kernel_events.py
new file mode 100644
index 0000000..8d60e00
--- /dev/null
+++ b/Chapter05/multi-kernel_events.py
@@ -0,0 +1,75 @@
+import pycuda.autoinit
+import pycuda.driver as drv
+from pycuda import gpuarray
+from pycuda.compiler import SourceModule
+import numpy as np
+from time import time
+
+num_arrays = 200
+array_len = 1024**2
+
+ker = SourceModule("""
+__global__ void mult_ker(float * array, int array_len)
+{
+    int thd = blockIdx.x*blockDim.x + threadIdx.x;
+    int num_iters = array_len / blockDim.x;
+    for(int j=0; j < num_iters; j++)
+    {
+        int i = j * blockDim.x + thd;
+        for(int k = 0; k < 50; k++)
+        {
+            array[i] *= 2.0;
+            array[i] /= 2.0;
+        }
+    }
+}
+""")
+
+mult_ker = ker.get_function('mult_ker')
+
+data = []
+data_gpu = []
+gpu_out = []
+streams = []
+start_events = []
+end_events = []
+
+for _ in range(num_arrays):
+    streams.append(drv.Stream())
+    start_events.append(drv.Event())
+    end_events.append(drv.Event())
+
+# generate random arrays.
+for _ in range(num_arrays):
+    data.append(np.random.randn(array_len).astype('float32'))
+
+t_start = time()
+
+# copy arrays to GPU.
+for k in range(num_arrays):
+    data_gpu.append(gpuarray.to_gpu_async(data[k], stream=streams[k]))
+
+# process arrays.
+for k in range(num_arrays):
+    start_events[k].record(streams[k])
+    mult_ker(data_gpu[k], np.int32(array_len), block=(64,1,1), grid=(1,1,1), stream=streams[k])
+for k in range(num_arrays):
+    end_events[k].record(streams[k])
+
+# copy arrays from GPU.
+for k in range(num_arrays):
+    gpu_out.append(data_gpu[k].get_async(stream=streams[k]))
+
+t_end = time()
+
+for k in range(num_arrays):
+    assert (np.allclose(gpu_out[k], data[k]))
+
+kernel_times = []
+
+for k in range(num_arrays):
+    kernel_times.append(start_events[k].time_till(end_events[k]))
+
+print('Total time: %f' % (t_end - t_start))
+print('Mean kernel duration (milliseconds): %f' % np.mean(kernel_times))
+print('Standard deviation of kernel duration (milliseconds): %f' % np.std(kernel_times))
diff --git a/Chapter05/multi-kernel_multi-thread.py b/Chapter05/multi-kernel_multi-thread.py
new file mode 100644
index 0000000..dcf745d
--- /dev/null
+++ b/Chapter05/multi-kernel_multi-thread.py
@@ -0,0 +1,84 @@
+import pycuda
+import pycuda.driver as drv
+from pycuda import gpuarray
+from pycuda.compiler import SourceModule
+import numpy as np
+from time import time
+import threading
+
+
+num_arrays = 10
+array_len = 1024**2
+
+kernel_code = """
+__global__ void mult_ker(float * array, int array_len)
+{
+    int thd = blockIdx.x*blockDim.x + threadIdx.x;
+    int num_iters = array_len / blockDim.x;
+
+    for(int j=0; j < num_iters; j++)
+    {
+        int i = j * blockDim.x + thd;
+
+        for(int k = 0; k < 50; k++)
+        {
+            array[i] *= 2.0;
+            array[i] /= 2.0;
+        }
+    }
+
+}
+"""
+
+class KernelLauncherThread(threading.Thread):
+    def __init__(self, input_array):
+        threading.Thread.__init__(self)
+        self.input_array = input_array
+        self.output_array = None
+
+    def run(self):
+        self.dev = drv.Device(0)
+        self.context = self.dev.make_context()
+
+        self.ker = SourceModule(kernel_code)
+
+        self.mult_ker = self.ker.get_function('mult_ker')
+
+        self.array_gpu = gpuarray.to_gpu(self.input_array)
+
+        self.mult_ker(self.array_gpu, np.int32(array_len), block=(64,1,1), grid=(1,1,1))
+
+        self.output_array = self.array_gpu.get()
+
+        self.context.pop()
+
+    def join(self):
+        threading.Thread.join(self)
+        return self.output_array
+
+drv.init()
+
+
+data = []
+gpu_out = []
+threads = []
+
+# generate random arrays and thread objects.
+for _ in range(num_arrays):
+    data.append(np.random.randn(array_len).astype('float32'))
+
+for k in range(num_arrays):
+    # create a thread that uses data we just generated
+    threads.append(KernelLauncherThread(data[k]))
+
+# launch threads to process arrays.
+for k in range(num_arrays):
+    threads[k].start()
+
+# get data from launched threads.
+for k in range(num_arrays):
+    gpu_out.append(threads[k].join())
+
+for k in range(num_arrays):
+    assert (np.allclose(gpu_out[k], data[k]))
+
diff --git a/Chapter05/multi-kernel_streams.py b/Chapter05/multi-kernel_streams.py
new file mode 100644
index 0000000..146c334
--- /dev/null
+++ b/Chapter05/multi-kernel_streams.py
@@ -0,0 +1,64 @@
+import pycuda.autoinit
+import pycuda.driver as drv
+from pycuda import gpuarray
+from pycuda.compiler import SourceModule
+import numpy as np
+from time import time
+
+num_arrays = 200
+array_len = 1024**2
+
+ker = SourceModule("""
+__global__ void mult_ker(float * array, int array_len)
+{
+    int thd = blockIdx.x*blockDim.x + threadIdx.x;
+    int num_iters = array_len / blockDim.x;
+
+    for(int j=0; j < num_iters; j++)
+    {
+        int i = j * blockDim.x + thd;
+
+        for(int k = 0; k < 50; k++)
+        {
+            array[i] *= 2.0;
+            array[i] /= 2.0;
+        }
+    }
+
+}
+""")
+
+mult_ker = ker.get_function('mult_ker')
+
+data = []
+data_gpu = []
+gpu_out = []
+streams = []
+
+for _ in range(num_arrays):
+    streams.append(drv.Stream())
+
+# generate random arrays.
+for _ in range(num_arrays):
+    data.append(np.random.randn(array_len).astype('float32'))
+
+t_start = time()
+
+# copy arrays to GPU.
+for k in range(num_arrays):
+    data_gpu.append(gpuarray.to_gpu_async(data[k], stream=streams[k]))
+
+# process arrays.
+for k in range(num_arrays):
+    mult_ker(data_gpu[k], np.int32(array_len), block=(64,1,1), grid=(1,1,1), stream=streams[k])
+
+# copy arrays from GPU.
+for k in range(num_arrays):
+    gpu_out.append(data_gpu[k].get_async(stream=streams[k]))
+
+t_end = time()
+
+for k in range(num_arrays):
+    assert (np.allclose(gpu_out[k], data[k]))
+
+print('Total time: %f' % (t_end - t_start))
diff --git a/Chapter05/simple_context_create.py b/Chapter05/simple_context_create.py
new file mode 100644
index 0000000..2b99316
--- /dev/null
+++ b/Chapter05/simple_context_create.py
@@ -0,0 +1,12 @@
+import numpy as np
+from pycuda import gpuarray
+import pycuda.driver as drv
+
+drv.init()
+dev = drv.Device(0)
+ctx = dev.make_context()
+
+x = gpuarray.to_gpu(np.float32([1,2,3]))
+print(x.get())
+
+ctx.pop()
diff --git a/Chapter05/simple_event_example.py b/Chapter05/simple_event_example.py
new file mode 100644
index 0000000..5060e9b
--- /dev/null
+++ b/Chapter05/simple_event_example.py
@@ -0,0 +1,47 @@
+import pycuda.autoinit
+import pycuda.driver as drv
+from pycuda import gpuarray
+from pycuda.compiler import SourceModule
+import numpy as np
+from time import time
+
+ker = SourceModule("""
+__global__ void mult_ker(float * array, int array_len)
+{
+    int thd = blockIdx.x*blockDim.x + threadIdx.x;
+    int num_iters = array_len / blockDim.x;
+
+    for(int j=0; j < num_iters; j++)
+    {
+        int i = j * blockDim.x + thd;
+
+        for(int k = 0; k < 50; k++)
+        {
+            array[i] *= 2.0;
+            array[i] /= 2.0;
+        }
+    }
+}
+""")
+
+mult_ker = ker.get_function('mult_ker')
+
+array_len = 100*1024**2
+
+data = np.random.randn(array_len).astype('float32')
+data_gpu = gpuarray.to_gpu(data)
+
+start_event = drv.Event()
+end_event = drv.Event()
+
+start_event.record()
+mult_ker(data_gpu, np.int32(array_len), block=(64,1,1), grid=(1,1,1))
+end_event.record()
+
+end_event.synchronize()
+
+print('Has the kernel started yet? {}'.format(start_event.query()))
+print('Has the kernel ended yet? {}'.format(end_event.query()))
+
+print('Kernel execution time in milliseconds: %f ' % start_event.time_till(end_event))
+
diff --git a/Chapter05/single_thread_example.py b/Chapter05/single_thread_example.py
new file mode 100644
index 0000000..976ba8d
--- /dev/null
+++ b/Chapter05/single_thread_example.py
@@ -0,0 +1,20 @@
+import threading
+
+class PointlessExampleThread(threading.Thread):
+    def __init__(self):
+        threading.Thread.__init__(self)
+        self.return_value = None
+
+    def run(self):
+        print('Hello from the thread you just spawned!')
+        self.return_value = 123
+
+    def join(self):
+        threading.Thread.join(self)
+        return self.return_value
+
+
+NewThread = PointlessExampleThread()
+NewThread.start()
+thread_output = NewThread.join()
+print('The thread completed and returned this value: %s' % thread_output)
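
A note on the stream examples above: to_gpu_async and get_async can only overlap transfers with kernel execution when the host buffers are page-locked; with ordinary numpy arrays the driver may fall back to effectively synchronous copies. Below is a minimal sketch, assuming PyCUDA's drv.pagelocked_empty helper, of how one per-stream buffer from multi-kernel_streams.py could be staged through pinned memory (the host_in and host_out names are illustrative and not part of the patch).

import numpy as np
import pycuda.autoinit
import pycuda.driver as drv
from pycuda import gpuarray

array_len = 1024**2
stream = drv.Stream()

# Page-locked (pinned) host buffer: needed for genuinely asynchronous copies.
host_in = drv.pagelocked_empty(array_len, dtype=np.float32)
host_in[:] = np.random.randn(array_len).astype(np.float32)

# Asynchronous host-to-device copy issued on the stream.
data_gpu = gpuarray.to_gpu_async(host_in, stream=stream)

# ... kernel launches on `stream` would go here, as in multi-kernel_streams.py ...

# Asynchronous device-to-host copy back into another pinned buffer.
host_out = drv.pagelocked_empty(array_len, dtype=np.float32)
data_gpu.get_async(stream=stream, ary=host_out)

# The host must wait for the stream before reading host_out.
stream.synchronize()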
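In multi-kernel_events.py, one rough way to confirm that the streams actually overlap is to compare the sum of the per-kernel durations with the span from the first recorded start event to the last recorded end event. A minimal sketch that could be appended after the existing timing code; it reuses the start_events, end_events, and kernel_times lists defined in that file, and the "concurrency factor" wording is just illustrative.

# Approximate overlap check, reusing the event lists from multi-kernel_events.py.
# If the streams run concurrently, the wall-clock span of all kernels should be
# much smaller than the sum of the individual kernel durations.
total_span_ms = start_events[0].time_till(end_events[-1])  # first start -> last end
summed_ms = sum(kernel_times)                              # serial-equivalent total
print('Span of all kernels (milliseconds): %f' % total_span_ms)
print('Sum of kernel durations (milliseconds): %f' % summed_ms)
print('Approximate concurrency factor: %f' % (summed_ms / total_span_ms))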