# Mirror of:
# https://github.com/PacktPublishing/Hands-On-GPU-Programming-with-CUDA-C-and-Python-3.x-Second-Edition.git
# Synced: 2025-07-21 12:51:06 +02:00  (85 lines, 1.9 KiB, Python)
import pycuda
|
|
import pycuda.driver as drv
|
|
from pycuda import gpuarray
|
|
from pycuda.compiler import SourceModule
|
|
import numpy as np
|
|
from time import time
|
|
import threading
|
|
|
|
|
|
num_arrays = 10
|
|
array_len = 1024**2
|
|
|
|
kernel_code = """
|
|
__global__ void mult_ker(float * array, int array_len)
|
|
{
|
|
int thd = blockIdx.x*blockDim.x + threadIdx.x;
|
|
int num_iters = array_len / blockDim.x;
|
|
|
|
for(int j=0; j < num_iters; j++)
|
|
{
|
|
int i = j * blockDim.x + thd;
|
|
|
|
for(int k = 0; k < 50; k++)
|
|
{
|
|
array[i] *= 2.0;
|
|
array[i] /= 2.0;
|
|
}
|
|
}
|
|
|
|
}
|
|
"""
|
|
|
|
class KernelLauncherThread(threading.Thread):
|
|
def __init__(self, input_array):
|
|
threading.Thread.__init__(self)
|
|
self.input_array = input_array
|
|
self.output_array = None
|
|
|
|
def run(self):
|
|
self.dev = drv.Device(0)
|
|
self.context = self.dev.make_context()
|
|
|
|
self.ker = SourceModule(kernel_code)
|
|
|
|
self.mult_ker = self.ker.get_function('mult_ker')
|
|
|
|
self.array_gpu = gpuarray.to_gpu(self.input_array)
|
|
|
|
self.mult_ker(self.array_gpu, np.int32(array_len), block=(64,1,1), grid=(1,1,1))
|
|
|
|
self.output_array = self.array_gpu.get()
|
|
|
|
self.context.pop()
|
|
|
|
def join(self):
|
|
threading.Thread.join(self)
|
|
return self.output_array
|
|
|
|
drv.init()
|
|
|
|
|
|
data = []
|
|
gpu_out = []
|
|
threads = []
|
|
|
|
# generate random arrays and thread objects.
|
|
for _ in range(num_arrays):
|
|
data.append(np.random.randn(array_len).astype('float32'))
|
|
|
|
for k in range(num_arrays):
|
|
# create a thread that uses data we just generated
|
|
threads.append(KernelLauncherThread(data[k]))
|
|
|
|
# launch threads to process arrays.
|
|
for k in range(num_arrays):
|
|
threads[k].start()
|
|
|
|
# get data from launched threads.
|
|
for k in range(num_arrays):
|
|
gpu_out.append(threads[k].join())
|
|
|
|
for k in range(num_arrays):
|
|
assert (np.allclose(gpu_out[k], data[k]))
|
|
|