diff --git a/Chapter06/broken_matrix_ker.py b/Chapter06/broken_matrix_ker.py
new file mode 100644
index 0000000..eb11c55
--- /dev/null
+++ b/Chapter06/broken_matrix_ker.py
@@ -0,0 +1,79 @@
+# Note: this code is intentionally broken!!!
+# (This is intended to show a case study of how to debug CUDA code
+# using printf.)
+
+import pycuda.autoinit
+import pycuda.driver as drv
+from pycuda import gpuarray
+from pycuda.compiler import SourceModule
+import numpy as np
+
+
+ker = SourceModule('''
+// row-column dot-product for matrix multiplication
+__device__ float rowcol_dot(float *matrix_a, float *matrix_b, int row, int col, int N)
+{
+
+    //printf("threadIdx.x,y: %d,%d blockIdx.x,y: %d,%d -- row is %d, col is %d, N is %d.\\n", threadIdx.x, threadIdx.y, blockIdx.x, blockIdx.y, row, col, N);
+    float val = 0;
+
+    for (int k=0; k < N; k++)
+    {
+
+        // broken version
+        val += matrix_a[ row + k*N ] * matrix_b[ col*N + k];
+
+        //if(threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0 && blockIdx.y == 0)
+        //    printf("Dot-product loop: k value is %d, matrix_a value is %f, matrix_b is %f.\\n", k, matrix_a[ row + k*N ], matrix_b[ col*N + k]);
+
+        // fixed version
+        //val += matrix_a[ row*N + k ] * matrix_b[ col + k*N];
+    }
+
+    return(val);
+
+}
+
+// matrix multiplication kernel that is parallelized over row/column tuples.
+__global__ void matrix_mult_ker(float * matrix_a, float * matrix_b, float * output_matrix, int N)
+{
+
+    // broken version
+    int row = blockIdx.x + threadIdx.x;
+    int col = blockIdx.y + threadIdx.y;
+
+    // fixed version
+    //int row = blockIdx.x*blockDim.x + threadIdx.x;
+    //int col = blockIdx.y*blockDim.y + threadIdx.y;
+
+    //printf("threadIdx.x,y: %d,%d blockIdx.x,y: %d,%d -- row is %d, col is %d.\\n", threadIdx.x, threadIdx.y, blockIdx.x, blockIdx.y, row, col);
+
+
+    // broken version
+    output_matrix[col + row*N] = rowcol_dot(matrix_a, matrix_b, col, row, N);
+
+    // fixed version
+    //output_matrix[col + row*N] = rowcol_dot(matrix_a, matrix_b, row, col, N);
+
+
+}
+''')
+
+matrix_ker = ker.get_function('matrix_mult_ker')
+
+test_a = np.float32( [range(1,5)] * 4 )
+test_b = np.float32([range(14,10, -1)]*4 )
+
+output_mat = np.matmul(test_a, test_b)
+
+test_a_gpu = gpuarray.to_gpu(test_a)
+test_b_gpu = gpuarray.to_gpu(test_b)
+output_mat_gpu = gpuarray.empty_like(test_a_gpu)
+
+matrix_ker(test_a_gpu, test_b_gpu, output_mat_gpu, np.int32(4), block=(2,2,1), grid=(2,2,1))
+
+assert( np.allclose(output_mat_gpu.get(), output_mat) )
+
+
+
+
diff --git a/Chapter06/divergence_test.cu b/Chapter06/divergence_test.cu
new file mode 100644
index 0000000..78dcb93
--- /dev/null
+++ b/Chapter06/divergence_test.cu
@@ -0,0 +1,18 @@
+#include <cuda_runtime.h>
+#include <stdio.h>
+
+__global__ void divergence_test_ker()
+{
+    if( threadIdx.x % 2 == 0)
+        printf("threadIdx.x %d : This is an even thread.\n", threadIdx.x);
+    else
+        printf("threadIdx.x %d : This is an odd thread.\n", threadIdx.x);
+}
+
+__host__ int main()
+{
+    cudaSetDevice(0);
+    divergence_test_ker<<<1, 32>>>();
+    cudaDeviceSynchronize();
+    cudaDeviceReset();
+}
diff --git a/Chapter06/hello-world_gpu.py b/Chapter06/hello-world_gpu.py
new file mode 100644
index 0000000..3cfd425
--- /dev/null
+++ b/Chapter06/hello-world_gpu.py
@@ -0,0 +1,23 @@
+import pycuda.autoinit
+import pycuda.driver as drv
+from pycuda import gpuarray
+from pycuda.compiler import SourceModule
+
+ker = SourceModule('''
+__global__ void hello_world_ker()
+{
+    printf("Hello world from thread %d, in block %d!\\n", threadIdx.x, blockIdx.x);
+
+    __syncthreads();
+
+    if(threadIdx.x == 0 && blockIdx.x == 0)
+    {
+        printf("-------------------------------------\\n");
+        printf("This kernel was launched over a grid consisting of %d blocks,\\n", gridDim.x);
+        printf("where each block has %d threads.\\n", blockDim.x);
+    }
+}
+''')
+
+hello_ker = ker.get_function("hello_world_ker")
+hello_ker( block=(5,1,1), grid=(2,1,1) )
diff --git a/Chapter06/matrix_ker.cu b/Chapter06/matrix_ker.cu
new file mode 100644
index 0000000..9ea76d3
--- /dev/null
+++ b/Chapter06/matrix_ker.cu
@@ -0,0 +1,143 @@
+#include <cuda_runtime.h>
+#include <stdio.h>
+#include <stdlib.h>
+#define _EPSILON 0.001
+#define _ABS(x) ( x > 0.0f ? x : -x )
+
+__host__ int allclose(float *A, float *B, int len)
+{
+
+    int returnval = 0;
+
+    for (int i = 0; i < len; i++)
+    {
+        if ( _ABS(A[i] - B[i]) > _EPSILON )
+        {
+            returnval = -1;
+            break;
+        }
+    }
+
+    return(returnval);
+}
+
+
+// row-column dot-product for matrix multiplication
+__device__ float rowcol_dot(float *matrix_a, float *matrix_b, int row, int col, int N)
+{
+    float val = 0;
+
+    for (int k=0; k < N; k++)
+    {
+        val += matrix_a[ row*N + k ] * matrix_b[ col + k*N];
+    }
+
+    return(val);
+}
+
+// matrix multiplication kernel that is parallelized over row/column tuples.
+__global__ void matrix_mult_ker(float * matrix_a, float * matrix_b, float * output_matrix, int N)
+{
+
+    int row = blockIdx.x*blockDim.x + threadIdx.x;
+    int col = blockIdx.y*blockDim.y + threadIdx.y;
+
+    output_matrix[col + row*N] = rowcol_dot(matrix_a, matrix_b, row, col, N);
+}
+
+
+__host__ int main()
+{
+
+    // Initialize to use first GPU.
+    cudaSetDevice(0);
+
+    // this indicates the width/height of the matrices
+    int N = 4;
+
+    // this will indicate how many bytes to allocate to store a test or output matrix
+    int num_bytes = sizeof(float)*N*N;
+
+    // input test matrix A
+    float h_A[] = { 1.0, 2.0, 3.0, 4.0, \
+                    1.0, 2.0, 3.0, 4.0, \
+                    1.0, 2.0, 3.0, 4.0, \
+                    1.0, 2.0, 3.0, 4.0 };
+
+    // input test matrix B
+    float h_B[] = { 14.0, 13.0, 12.0, 11.0, \
+                    14.0, 13.0, 12.0, 11.0, \
+                    14.0, 13.0, 12.0, 11.0, \
+                    14.0, 13.0, 12.0, 11.0 };
+
+    // expected output of A times B
+    float h_AxB[] = { 140.0, 130.0, 120.0, 110.0, \
+                      140.0, 130.0, 120.0, 110.0, \
+                      140.0, 130.0, 120.0, 110.0, \
+                      140.0, 130.0, 120.0, 110.0 };
+
+
+    // these pointers will be used for the GPU.
+    // (notice how we use normal float pointers)
+    float * d_A;
+    float * d_B;
+    float * d_output;
+
+    // allocate memory for the test matrices on the GPU
+    cudaMalloc((float **) &d_A, num_bytes);
+    cudaMalloc((float **) &d_B, num_bytes);
+
+    // copy the test matrices to the GPU
+    cudaMemcpy(d_A, h_A, num_bytes, cudaMemcpyHostToDevice);
+    cudaMemcpy(d_B, h_B, num_bytes, cudaMemcpyHostToDevice);
+
+    // allocate memory for output on GPU
+    cudaMalloc((float **) &d_output, num_bytes);
+
+    // this will store the output from the GPU
+    float * h_output;
+    h_output = (float *) malloc(num_bytes);
+
+    // setup our block and grid launch parameters with the dim3 class.
+    dim3 block(2,2,1);
+    dim3 grid(2,2,1);
+
+    // launch our kernel
+    matrix_mult_ker <<< grid, block >>> (d_A, d_B, d_output, N);
+
+    // synchronize on the host, to ensure our kernel has finished executing.
+    cudaDeviceSynchronize();
+
+    // copy output from device to host.
+    cudaMemcpy(h_output, d_output, num_bytes, cudaMemcpyDeviceToHost);
+
+    // synchronize again.
+    cudaDeviceSynchronize();
+
+    // free arrays on device.
+    cudaFree(d_A);
+    cudaFree(d_B);
+    cudaFree(d_output);
+
+    // reset the GPU.
+    cudaDeviceReset();
+
+
+    // Check to see if we got the expected output.
+    // in both cases, remember to de-allocate h_output before returning.
+
+    if (allclose(h_AxB, h_output, N*N) < 0)
+    {
+        printf("Error! Output of kernel does not match expected output.\n");
+        free(h_output);
+        return(-1);
+    }
+    else
+    {
+        printf("Success! Output of kernel matches expected output.\n");
+        free(h_output);
+        return(0);
+    }
+
+
+}
diff --git a/Chapter06/matrix_ker.py b/Chapter06/matrix_ker.py
new file mode 100644
index 0000000..9878787
--- /dev/null
+++ b/Chapter06/matrix_ker.py
@@ -0,0 +1,57 @@
+# This program is the "fixed" version of broken_matrix_ker.py
+
+# This is to be used for an exercise where this code is translated to
+# a pure CUDA-C version.
+
+import pycuda.autoinit
+import pycuda.driver as drv
+from pycuda import gpuarray
+from pycuda.compiler import SourceModule
+import numpy as np
+
+
+ker = SourceModule('''
+// row-column dot-product for matrix multiplication
+__device__ float rowcol_dot(float *matrix_a, float *matrix_b, int row, int col, int N)
+{
+    float val = 0;
+
+    for (int k=0; k < N; k++)
+    {
+        val += matrix_a[ row*N + k ] * matrix_b[ col + k*N];
+    }
+
+    return(val);
+
+}
+
+// matrix multiplication kernel that is parallelized over row/column tuples.
+__global__ void matrix_mult_ker(float * matrix_a, float * matrix_b, float * output_matrix, int N)
+{
+
+    int row = blockIdx.x*blockDim.x + threadIdx.x;
+    int col = blockIdx.y*blockDim.y + threadIdx.y;
+
+    output_matrix[col + row*N] = rowcol_dot(matrix_a, matrix_b, row, col, N);
+
+}
+''')
+
+matrix_ker = ker.get_function('matrix_mult_ker')
+
+test_a = np.float32([range(1,5)] * 4)
+test_b = np.float32([range(14,10, -1)]*4 )
+
+output_mat = np.matmul(test_a, test_b)
+
+test_a_gpu = gpuarray.to_gpu(test_a)
+test_b_gpu = gpuarray.to_gpu(test_b)
+output_mat_gpu = gpuarray.empty_like(test_a_gpu)
+
+matrix_ker(test_a_gpu, test_b_gpu, output_mat_gpu, np.int32(4), block=(2,2,1), grid=(2,2,1))
+
+assert(np.allclose(output_mat_gpu.get(), output_mat) )
+
+
+