Mirror of https://github.com/PacktPublishing/Hands-On-GPU-Programming-with-CUDA-C-and-Python-3.x-Second-Edition.git (synced 2025-07-21 21:01:06 +02:00)
added chapter05 examples...
Chapter05/conway_gpu_streams.py (new file, 104 lines)
@@ -0,0 +1,104 @@
# CUDA Stream-based Concurrent Conway's game of life in Python / CUDA C
# written by Brian Tuomanen for "Hands on GPU Programming with Python and CUDA"

import pycuda.autoinit
import pycuda.driver as drv
from pycuda import gpuarray
from pycuda.compiler import SourceModule
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.animation as animation

ker = SourceModule("""
#define _X  ( threadIdx.x + blockIdx.x * blockDim.x )
#define _Y  ( threadIdx.y + blockIdx.y * blockDim.y )

#define _WIDTH  ( blockDim.x * gridDim.x )
#define _HEIGHT ( blockDim.y * gridDim.y )

#define _XM(x)  ( (x + _WIDTH) % _WIDTH )
#define _YM(y)  ( (y + _HEIGHT) % _HEIGHT )

#define _INDEX(x,y)  ( _XM(x) + _YM(y) * _WIDTH )

// return the number of living neighbors for a given cell
__device__ int nbrs(int x, int y, int * in)
{
    return ( in[ _INDEX(x-1, y+1) ] + in[ _INDEX(x-1, y) ] + in[ _INDEX(x-1, y-1) ] \
           + in[ _INDEX(x, y+1) ]                          + in[ _INDEX(x, y-1) ]   \
           + in[ _INDEX(x+1, y+1) ] + in[ _INDEX(x+1, y) ] + in[ _INDEX(x+1, y-1) ] );
}

__global__ void conway_ker(int * lattice_out, int * lattice)
{
    // x, y are the appropriate values for the cell covered by this thread
    int x = _X, y = _Y;

    // count the number of neighbors around the current cell
    int n = nbrs(x, y, lattice);

    // if the current cell is alive, then determine if it lives or dies for the next generation.
    if ( lattice[_INDEX(x,y)] == 1 )
        switch(n)
        {
            // if the cell is alive: it remains alive only if it has 2 or 3 neighbors.
            case 2:
            case 3: lattice_out[_INDEX(x,y)] = 1;
                    break;
            default: lattice_out[_INDEX(x,y)] = 0;
        }
    else if ( lattice[_INDEX(x,y)] == 0 )
        switch(n)
        {
            // a dead cell comes to life only if it has 3 neighbors that are alive.
            case 3: lattice_out[_INDEX(x,y)] = 1;
                    break;
            default: lattice_out[_INDEX(x,y)] = 0;
        }
}
""")

conway_ker = ker.get_function("conway_ker")


def update_gpu(frameNum, imgs, newLattices_gpu, lattices_gpu, N, streams, num_concurrent):
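    # launch one kernel per stream: each lattice's kernel, device-to-host copy,
    # and device-to-device update are issued on their own CUDA stream, so the
    # num_concurrent lattices can be processed concurrently on the GPU.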
    for k in range(num_concurrent):
        conway_ker(newLattices_gpu[k], lattices_gpu[k], grid=(N//32, N//32, 1), block=(32, 32, 1), stream=streams[k])
        imgs[k].set_data(newLattices_gpu[k].get_async(stream=streams[k]))
        lattices_gpu[k].set_async(newLattices_gpu[k], stream=streams[k])

    return imgs


if __name__ == '__main__':
    # set lattice size
    N = 128

    num_concurrent = 4

    streams = []
    lattices_gpu = []
    newLattices_gpu = []

    for k in range(num_concurrent):
        streams.append(drv.Stream())
        lattice = np.int32(np.random.choice([1, 0], N*N, p=[0.25, 0.75]).reshape(N, N))
        lattices_gpu.append(gpuarray.to_gpu(lattice))
        newLattices_gpu.append(gpuarray.empty_like(lattices_gpu[k]))

    fig, ax = plt.subplots(nrows=1, ncols=num_concurrent)
    imgs = []

    for k in range(num_concurrent):
        imgs.append(ax[k].imshow(lattices_gpu[k].get_async(stream=streams[k]), interpolation='nearest'))

    ani = animation.FuncAnimation(fig, update_gpu, fargs=(imgs, newLattices_gpu, lattices_gpu, N, streams, num_concurrent), interval=0, frames=1000, save_count=1000)

    plt.show()
Chapter05/gpu_mandelbrot_context_sync.py (new file, 79 lines)
@@ -0,0 +1,79 @@
from time import time
import matplotlib
# this will prevent the figure from popping up
matplotlib.use('Agg')
from matplotlib import pyplot as plt
import numpy as np
import pycuda.autoinit
from pycuda import gpuarray
from pycuda.elementwise import ElementwiseKernel

mandel_ker = ElementwiseKernel(
"pycuda::complex<float> *lattice, float *mandelbrot_graph, int max_iters, float upper_bound",
"""
mandelbrot_graph[i] = 1;

pycuda::complex<float> c = lattice[i];
pycuda::complex<float> z(0,0);

for (int j = 0; j < max_iters; j++)
{
    z = z*z + c;

    if(abs(z) > upper_bound)
    {
        mandelbrot_graph[i] = 0;
        break;
    }
}
""",
"mandel_ker")


def gpu_mandelbrot(width, height, real_low, real_high, imag_low, imag_high, max_iters, upper_bound):
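    # NOTE: unlike the plain Mandelbrot example, this version moves data with the
    # asynchronous to_gpu_async / get_async calls and relies on explicit
    # pycuda.autoinit.context.synchronize() calls to ensure each step has
    # finished before the next one uses its result.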
    # we set up our complex lattice as such
    real_vals = np.matrix(np.linspace(real_low, real_high, width), dtype=np.complex64)
    imag_vals = np.matrix(np.linspace(imag_high, imag_low, height), dtype=np.complex64) * 1j
    mandelbrot_lattice = np.array(real_vals + imag_vals.transpose(), dtype=np.complex64)

    # copy complex lattice to the GPU
    mandelbrot_lattice_gpu = gpuarray.to_gpu_async(mandelbrot_lattice)

    # synchronize in current context
    pycuda.autoinit.context.synchronize()

    # allocate an empty array on the GPU
    mandelbrot_graph_gpu = gpuarray.empty(shape=mandelbrot_lattice.shape, dtype=np.float32)

    mandel_ker(mandelbrot_lattice_gpu, mandelbrot_graph_gpu, np.int32(max_iters), np.float32(upper_bound))

    pycuda.autoinit.context.synchronize()

    mandelbrot_graph = mandelbrot_graph_gpu.get_async()

    pycuda.autoinit.context.synchronize()

    return mandelbrot_graph


if __name__ == '__main__':

    t1 = time()
    mandel = gpu_mandelbrot(512, 512, -2, 2, -2, 2, 256, 2)
    t2 = time()

    mandel_time = t2 - t1

    t1 = time()
    fig = plt.figure(1)
    plt.imshow(mandel, extent=(-2, 2, -2, 2))
    plt.savefig('mandelbrot.png', dpi=fig.dpi)
    t2 = time()

    dump_time = t2 - t1

    print('It took {} seconds to calculate the Mandelbrot graph.'.format(mandel_time))
    print('It took {} seconds to dump the image.'.format(dump_time))
Chapter05/multi-kernel.py (new file, 60 lines)
@@ -0,0 +1,60 @@
import pycuda.autoinit
import pycuda.driver as drv
from pycuda import gpuarray
from pycuda.compiler import SourceModule
import numpy as np
from time import time

num_arrays = 200
array_len = 1024**2

ker = SourceModule("""
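// artificial workload: each thread walks over its slice of the array and
// repeatedly doubles and halves every element, leaving the data unchanged.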
__global__ void mult_ker(float * array, int array_len)
{
    int thd = blockIdx.x*blockDim.x + threadIdx.x;
    int num_iters = array_len / blockDim.x;

    for(int j=0; j < num_iters; j++)
    {
        int i = j * blockDim.x + thd;

        for(int k = 0; k < 50; k++)
        {
            array[i] *= 2.0;
            array[i] /= 2.0;
        }
    }
}
""")

mult_ker = ker.get_function('mult_ker')
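
# Baseline (serial) version: the host-to-device copies, kernel launches, and
# device-to-host copies below all use blocking calls on the default stream,
# so the 200 arrays are processed strictly one after another. Compare with
# multi-kernel_streams.py in this same commit.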
data = []
data_gpu = []
gpu_out = []

# generate random arrays.
for _ in range(num_arrays):
    data.append(np.random.randn(array_len).astype('float32'))

t_start = time()

# copy arrays to GPU.
for k in range(num_arrays):
    data_gpu.append(gpuarray.to_gpu(data[k]))

# process arrays.
for k in range(num_arrays):
    mult_ker(data_gpu[k], np.int32(array_len), block=(64,1,1), grid=(1,1,1))

# copy arrays from GPU.
for k in range(num_arrays):
    gpu_out.append(data_gpu[k].get())

t_end = time()

for k in range(num_arrays):
    assert (np.allclose(gpu_out[k], data[k]))

print('Total time: %f' % (t_end - t_start))
Chapter05/multi-kernel_events.py (new file, 75 lines)
@@ -0,0 +1,75 @@
import pycuda.autoinit
import pycuda.driver as drv
from pycuda import gpuarray
from pycuda.compiler import SourceModule
import numpy as np
from time import time

num_arrays = 200
array_len = 1024**2

ker = SourceModule("""
__global__ void mult_ker(float * array, int array_len)
{
    int thd = blockIdx.x*blockDim.x + threadIdx.x;
    int num_iters = array_len / blockDim.x;

    for(int j=0; j < num_iters; j++)
    {
        int i = j * blockDim.x + thd;

        for(int k = 0; k < 50; k++)
        {
            array[i] *= 2.0;
            array[i] /= 2.0;
        }
    }
}
""")

mult_ker = ker.get_function('mult_ker')

data = []
data_gpu = []
gpu_out = []
streams = []
start_events = []
end_events = []
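
# one stream per array, plus a start/end event pair recorded on that stream so
# that each kernel's execution time can be measured individually with time_till.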
for _ in range(num_arrays):
    streams.append(drv.Stream())
    start_events.append(drv.Event())
    end_events.append(drv.Event())

# generate random arrays.
for _ in range(num_arrays):
    data.append(np.random.randn(array_len).astype('float32'))

t_start = time()

# copy arrays to GPU.
for k in range(num_arrays):
    data_gpu.append(gpuarray.to_gpu_async(data[k], stream=streams[k]))

# process arrays.
for k in range(num_arrays):
    start_events[k].record(streams[k])
    mult_ker(data_gpu[k], np.int32(array_len), block=(64,1,1), grid=(1,1,1), stream=streams[k])

for k in range(num_arrays):
    end_events[k].record(streams[k])

# copy arrays from GPU.
for k in range(num_arrays):
    gpu_out.append(data_gpu[k].get_async(stream=streams[k]))

t_end = time()

for k in range(num_arrays):
    assert (np.allclose(gpu_out[k], data[k]))

kernel_times = []

for k in range(num_arrays):
    kernel_times.append(start_events[k].time_till(end_events[k]))

print('Total time: %f' % (t_end - t_start))
print('Mean kernel duration (milliseconds): %f' % np.mean(kernel_times))
print('Mean kernel standard deviation (milliseconds): %f' % np.std(kernel_times))
Chapter05/multi-kernel_multi-thread.py (new file, 84 lines)
@@ -0,0 +1,84 @@
import pycuda
import pycuda.driver as drv
from pycuda import gpuarray
from pycuda.compiler import SourceModule
import numpy as np
from time import time
import threading

num_arrays = 10
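# num_arrays is kept small here: each array gets its own host thread, and each
# thread creates (and later destroys) its own CUDA context, which is far more
# heavyweight than issuing work on a stream.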
array_len = 1024**2

kernel_code = """
__global__ void mult_ker(float * array, int array_len)
{
    int thd = blockIdx.x*blockDim.x + threadIdx.x;
    int num_iters = array_len / blockDim.x;

    for(int j=0; j < num_iters; j++)
    {
        int i = j * blockDim.x + thd;

        for(int k = 0; k < 50; k++)
        {
            array[i] *= 2.0;
            array[i] /= 2.0;
        }
    }
}
"""


class KernelLauncherThread(threading.Thread):
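    # Each launcher thread makes its own CUDA context on device 0 (a context is
    # tied to the host thread that created it), compiles the kernel inside that
    # context, runs it on its input array, and pops the context before exiting.
    # join() is overridden so the main thread can collect the result.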
    def __init__(self, input_array):
        threading.Thread.__init__(self)
        self.input_array = input_array
        self.output_array = None

    def run(self):
        self.dev = drv.Device(0)
        self.context = self.dev.make_context()

        self.ker = SourceModule(kernel_code)
        self.mult_ker = self.ker.get_function('mult_ker')

        self.array_gpu = gpuarray.to_gpu(self.input_array)

        self.mult_ker(self.array_gpu, np.int32(array_len), block=(64,1,1), grid=(1,1,1))

        self.output_array = self.array_gpu.get()

        self.context.pop()

    def join(self):
        threading.Thread.join(self)
        return self.output_array


drv.init()

data = []
gpu_out = []
threads = []

# generate random arrays and thread objects.
for _ in range(num_arrays):
    data.append(np.random.randn(array_len).astype('float32'))

for k in range(num_arrays):
    # create a thread that uses data we just generated
    threads.append(KernelLauncherThread(data[k]))

# launch threads to process arrays.
for k in range(num_arrays):
    threads[k].start()

# get data from launched threads.
for k in range(num_arrays):
    gpu_out.append(threads[k].join())

for k in range(num_arrays):
    assert (np.allclose(gpu_out[k], data[k]))
Chapter05/multi-kernel_streams.py (new file, 64 lines)
@@ -0,0 +1,64 @@
import pycuda.autoinit
import pycuda.driver as drv
from pycuda import gpuarray
from pycuda.compiler import SourceModule
import numpy as np
from time import time

num_arrays = 200
array_len = 1024**2

ker = SourceModule("""
__global__ void mult_ker(float * array, int array_len)
{
    int thd = blockIdx.x*blockDim.x + threadIdx.x;
    int num_iters = array_len / blockDim.x;

    for(int j=0; j < num_iters; j++)
    {
        int i = j * blockDim.x + thd;

        for(int k = 0; k < 50; k++)
        {
            array[i] *= 2.0;
            array[i] /= 2.0;
        }
    }
}
""")

mult_ker = ker.get_function('mult_ker')
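
# Same workload as multi-kernel.py, but each array gets its own stream and the
# memory copies use the *_async variants, so transfers and kernels belonging to
# different arrays are free to overlap on the GPU.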
data = []
data_gpu = []
gpu_out = []
streams = []

for _ in range(num_arrays):
    streams.append(drv.Stream())

# generate random arrays.
for _ in range(num_arrays):
    data.append(np.random.randn(array_len).astype('float32'))

t_start = time()

# copy arrays to GPU.
for k in range(num_arrays):
    data_gpu.append(gpuarray.to_gpu_async(data[k], stream=streams[k]))

# process arrays.
for k in range(num_arrays):
    mult_ker(data_gpu[k], np.int32(array_len), block=(64,1,1), grid=(1,1,1), stream=streams[k])

# copy arrays from GPU.
for k in range(num_arrays):
    gpu_out.append(data_gpu[k].get_async(stream=streams[k]))

t_end = time()

for k in range(num_arrays):
    assert (np.allclose(gpu_out[k], data[k]))

print('Total time: %f' % (t_end - t_start))
Chapter05/simple_context_create.py (new file, 12 lines)
@@ -0,0 +1,12 @@
import numpy as np
from pycuda import gpuarray
import pycuda.driver as drv

drv.init()
dev = drv.Device(0)
ctx = dev.make_context()
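
# the three calls above do by hand what `import pycuda.autoinit` normally does:
# initialize the driver, pick a device, and push a fresh context onto this
# thread; the context then has to be popped explicitly before the program ends.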
x = gpuarray.to_gpu(np.float32([1, 2, 3]))
print(x.get())

ctx.pop()
Chapter05/simple_event_example.py (new file, 47 lines)
@@ -0,0 +1,47 @@
import pycuda.autoinit
import pycuda.driver as drv
from pycuda import gpuarray
from pycuda.compiler import SourceModule
import numpy as np
from time import time

ker = SourceModule("""
__global__ void mult_ker(float * array, int array_len)
{
    int thd = blockIdx.x*blockDim.x + threadIdx.x;
    int num_iters = array_len / blockDim.x;

    for(int j=0; j < num_iters; j++)
    {
        int i = j * blockDim.x + thd;

        for(int k = 0; k < 50; k++)
        {
            array[i] *= 2.0;
            array[i] /= 2.0;
        }
    }
}
""")

mult_ker = ker.get_function('mult_ker')

array_len = 100*1024**2

data = np.random.randn(array_len).astype('float32')
data_gpu = gpuarray.to_gpu(data)

start_event = drv.Event()
end_event = drv.Event()
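
# record() marks a point in the GPU's stream of commands; query() reports
# whether that point has been reached yet, synchronize() blocks the host until
# it has, and time_till() gives the elapsed GPU time between two events in
# milliseconds.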
start_event.record()
mult_ker(data_gpu, np.int32(array_len), block=(64,1,1), grid=(1,1,1))
end_event.record()

end_event.synchronize()

print('Has the kernel started yet? {}'.format(start_event.query()))
print('Has the kernel ended yet? {}'.format(end_event.query()))

print('Kernel execution time in milliseconds: %f ' % start_event.time_till(end_event))
Chapter05/single_thread_example.py (new file, 20 lines)
@@ -0,0 +1,20 @@
import threading


class PointlessExampleThread(threading.Thread):
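    # A minimal Thread subclass demonstrating the pattern the GPU examples build
    # on: run() does the work, and join() is overridden so the caller gets a
    # return value back (threading.Thread.join itself returns nothing).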
    def __init__(self):
        threading.Thread.__init__(self)
        self.return_value = None

    def run(self):
        print('Hello from the thread you just spawned!')
        self.return_value = 123

    def join(self):
        threading.Thread.join(self)
        return self.return_value


NewThread = PointlessExampleThread()
NewThread.start()
thread_output = NewThread.join()
print('The thread completed and returned this value: %s' % thread_output)