added chapter06 (everything thus far works on Python 3, btw...)

This commit is contained in:
mp
2020-02-18 13:12:53 -08:00
parent 34f4d14daa
commit 9d4547b2a3
5 changed files with 320 additions and 0 deletions

Chapter06/broken_matrix_ker.py Normal file

@@ -0,0 +1,79 @@
# Note: this code is intentionally broken!!!
# (This is intended to show a case study of how to debug CUDA code
# using printf.)

import pycuda.autoinit
import pycuda.driver as drv
from pycuda import gpuarray
from pycuda.compiler import SourceModule
import numpy as np

ker = SourceModule('''
// row-column dot-product for matrix multiplication
__device__ float rowcol_dot(float *matrix_a, float *matrix_b, int row, int col, int N)
{
    //printf("threadIdx.x,y: %d,%d blockIdx.x,y: %d,%d -- row is %d, col is %d, N is %d.\\n", threadIdx.x, threadIdx.y, blockIdx.x, blockIdx.y, row, col, N);
    float val = 0;

    for (int k=0; k < N; k++)
    {
        // broken version
        val += matrix_a[ row + k*N ] * matrix_b[ col*N + k];

        //if(threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0 && blockIdx.y == 0)
        //    printf("Dot-product loop: k value is %d, matrix_a value is %f, matrix_b is %f.\\n", k, matrix_a[ row + k*N ], matrix_b[ col*N + k]);

        // fixed version
        //val += matrix_a[ row*N + k ] * matrix_b[ col + k*N];
    }

    return(val);
}

// matrix multiplication kernel that is parallelized over row/column tuples.
__global__ void matrix_mult_ker(float * matrix_a, float * matrix_b, float * output_matrix, int N)
{
    // broken version
    int row = blockIdx.x + threadIdx.x;
    int col = blockIdx.y + threadIdx.y;

    // fixed version
    //int row = blockIdx.x*blockDim.x + threadIdx.x;
    //int col = blockIdx.y*blockDim.y + threadIdx.y;

    //printf("threadIdx.x,y: %d,%d blockIdx.x,y: %d,%d -- row is %d, col is %d.\\n", threadIdx.x, threadIdx.y, blockIdx.x, blockIdx.y, row, col);

    // broken version
    output_matrix[col + row*N] = rowcol_dot(matrix_a, matrix_b, col, row, N);

    // fixed version
    //output_matrix[col + row*N] = rowcol_dot(matrix_a, matrix_b, row, col, N);
}
''')

matrix_ker = ker.get_function('matrix_mult_ker')

test_a = np.float32([range(1,5)] * 4)
test_b = np.float32([range(14,10, -1)] * 4)

output_mat = np.matmul(test_a, test_b)

test_a_gpu = gpuarray.to_gpu(test_a)
test_b_gpu = gpuarray.to_gpu(test_b)
output_mat_gpu = gpuarray.empty_like(test_a_gpu)

matrix_ker(test_a_gpu, test_b_gpu, output_mat_gpu, np.int32(4), block=(2,2,1), grid=(2,2,1))

assert( np.allclose(output_mat_gpu.get(), output_mat) )
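
(Side note, not part of the committed file: the "broken" index calculation above maps several threads to the same (row, col) pair and never reaches row or column 3. A minimal host-only Python sketch, assuming the same block=(2,2,1), grid=(2,2,1) launch as above, makes the collision visible; the variable names here are illustrative only.)

from itertools import product

block_dim, grid_dim = 2, 2   # matches block=(2,2,1), grid=(2,2,1) above

broken, fixed = set(), set()
for bx, by, tx, ty in product(range(grid_dim), range(grid_dim),
                              range(block_dim), range(block_dim)):
    broken.add((bx + tx, by + ty))                         # broken: blockIdx + threadIdx
    fixed.add((bx*block_dim + tx, by*block_dim + ty))      # fixed: blockIdx*blockDim + threadIdx

print(len(broken))   # 9 distinct (row, col) pairs: collisions, and index 3 is never produced
print(len(fixed))    # 16 distinct (row, col) pairs: exactly one thread per output element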


@@ -0,0 +1,18 @@
#include <cuda_runtime.h>
#include <stdio.h>

__global__ void divergence_test_ker()
{
    if( threadIdx.x % 2 == 0)
        printf("threadIdx.x %d : This is an even thread.\n", threadIdx.x);
    else
        printf("threadIdx.x %d : This is an odd thread.\n", threadIdx.x);
}

__host__ int main()
{
    cudaSetDevice(0);
    divergence_test_ker<<<1, 32>>>();
    cudaDeviceSynchronize();
    cudaDeviceReset();
}
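
(A hedged aside, not part of this commit: the same warp-divergence experiment can also be run from PyCUDA, in the style of the other Chapter06 scripts. This is only a sketch; it assumes a working PyCUDA install and reuses the kernel source above verbatim.)

import pycuda.autoinit
import pycuda.driver as drv
from pycuda.compiler import SourceModule

ker = SourceModule('''
__global__ void divergence_test_ker()
{
    if( threadIdx.x % 2 == 0)
        printf("threadIdx.x %d : This is an even thread.\\n", threadIdx.x);
    else
        printf("threadIdx.x %d : This is an odd thread.\\n", threadIdx.x);
}
''')

divergence_test_ker = ker.get_function('divergence_test_ker')
divergence_test_ker(block=(32,1,1), grid=(1,1,1))   # one warp: the even/odd branches serialize
drv.Context.synchronize()                           # flush the device-side printf output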


@@ -0,0 +1,23 @@
import pycuda.autoinit
import pycuda.driver as drv
from pycuda import gpuarray
from pycuda.compiler import SourceModule

ker = SourceModule('''
__global__ void hello_world_ker()
{
    printf("Hello world from thread %d, in block %d!\\n", threadIdx.x, blockIdx.x);

    __syncthreads();

    if(threadIdx.x == 0 && blockIdx.x == 0)
    {
        printf("-------------------------------------\\n");
        printf("This kernel was launched over a grid consisting of %d blocks,\\n", gridDim.x);
        printf("where each block has %d threads.\\n", blockDim.x);
    }
}
''')

hello_ker = ker.get_function("hello_world_ker")
hello_ker( block=(5,1,1), grid=(2,1,1) )
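
(Not in the committed file: the gridDim.x and blockDim.x values the kernel prints simply echo the grid= and block= launch arguments, so re-launching with different parameters changes both the number of greetings and the reported dimensions, for example:)

hello_ker( block=(4,1,1), grid=(3,1,1) )   # 3 blocks of 4 threads -> 12 "Hello world" lines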

Chapter06/matrix_ker.cu Normal file

@@ -0,0 +1,143 @@
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>
#define _EPSILON 0.001
// absolute value macro; the argument is parenthesized so that the negation
// applies to the whole expression (e.g. A[i] - B[i]), not just its first term.
#define _ABS(x) ( (x) > 0.0f ? (x) : -(x) )
__host__ int allclose(float *A, float *B, int len)
{
    int returnval = 0;

    for (int i = 0; i < len; i++)
    {
        if ( _ABS(A[i] - B[i]) > _EPSILON )
        {
            returnval = -1;
            break;
        }
    }

    return(returnval);
}

// row-column dot-product for matrix multiplication
__device__ float rowcol_dot(float *matrix_a, float *matrix_b, int row, int col, int N)
{
    float val = 0;

    for (int k=0; k < N; k++)
    {
        val += matrix_a[ row*N + k ] * matrix_b[ col + k*N];
    }

    return(val);
}

// matrix multiplication kernel that is parallelized over row/column tuples.
__global__ void matrix_mult_ker(float * matrix_a, float * matrix_b, float * output_matrix, int N)
{
    int row = blockIdx.x*blockDim.x + threadIdx.x;
    int col = blockIdx.y*blockDim.y + threadIdx.y;

    output_matrix[col + row*N] = rowcol_dot(matrix_a, matrix_b, row, col, N);
}
__host__ int main()
{
    // Initialize to use first GPU.
    cudaSetDevice(0);

    // this indicates the width/height of the matrices
    int N = 4;

    // this will indicate how many bytes to allocate to store a test or output matrix
    int num_bytes = sizeof(float)*N*N;

    // input test matrix A
    float h_A[] = { 1.0, 2.0, 3.0, 4.0, \
                    1.0, 2.0, 3.0, 4.0, \
                    1.0, 2.0, 3.0, 4.0, \
                    1.0, 2.0, 3.0, 4.0 };

    // input test matrix B
    float h_B[] = { 14.0, 13.0, 12.0, 11.0, \
                    14.0, 13.0, 12.0, 11.0, \
                    14.0, 13.0, 12.0, 11.0, \
                    14.0, 13.0, 12.0, 11.0 };

    // expected output of A times B
    float h_AxB[] = { 140.0, 130.0, 120.0, 110.0, \
                      140.0, 130.0, 120.0, 110.0, \
                      140.0, 130.0, 120.0, 110.0, \
                      140.0, 130.0, 120.0, 110.0 };

    // these pointers will be used for the GPU.
    // (notice how we use normal float pointers)
    float * d_A;
    float * d_B;
    float * d_output;

    // allocate memory for the test matrices on the GPU
    cudaMalloc((float **) &d_A, num_bytes);
    cudaMalloc((float **) &d_B, num_bytes);

    // copy the test matrices to the GPU
    cudaMemcpy(d_A, h_A, num_bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, num_bytes, cudaMemcpyHostToDevice);

    // allocate memory for output on GPU
    cudaMalloc((float **) &d_output, num_bytes);

    // this will store the output from the GPU
    float * h_output;
    h_output = (float *) malloc(num_bytes);

    // setup our block and grid launch parameters with the dim3 class.
    dim3 block(2,2,1);
    dim3 grid(2,2,1);

    // launch our kernel
    matrix_mult_ker <<< grid, block >>> (d_A, d_B, d_output, N);

    // synchronize on the host, to ensure our kernel has finished executing.
    cudaDeviceSynchronize();

    // copy output from device to host.
    cudaMemcpy(h_output, d_output, num_bytes, cudaMemcpyDeviceToHost);

    // synchronize again.
    cudaDeviceSynchronize();

    // free arrays on device.
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_output);

    // reset the GPU.
    cudaDeviceReset();

    // Check to see if we got the expected output.
    // in both cases, remember to de-allocate h_output before returning.
    if (allclose(h_AxB, h_output, N*N) < 0)
    {
        printf("Error! Output of kernel does not match expected output.\n");
        free(h_output);
        return(-1);
    }
    else
    {
        printf("Success! Output of kernel matches expected output.\n");
        free(h_output);
        return(0);
    }
}

Chapter06/matrix_ker.py Normal file

@@ -0,0 +1,57 @@
# This program is the "fixed" version of broken_matrix_ker.py
# This is to be used for an exercise where this code is translated to
# a pure CUDA-C version.

import pycuda.autoinit
import pycuda.driver as drv
from pycuda import gpuarray
from pycuda.compiler import SourceModule
import numpy as np

ker = SourceModule('''
// row-column dot-product for matrix multiplication
__device__ float rowcol_dot(float *matrix_a, float *matrix_b, int row, int col, int N)
{
    float val = 0;

    for (int k=0; k < N; k++)
    {
        val += matrix_a[ row*N + k ] * matrix_b[ col + k*N];
    }

    return(val);
}

// matrix multiplication kernel that is parallelized over row/column tuples.
__global__ void matrix_mult_ker(float * matrix_a, float * matrix_b, float * output_matrix, int N)
{
    int row = blockIdx.x*blockDim.x + threadIdx.x;
    int col = blockIdx.y*blockDim.y + threadIdx.y;

    output_matrix[col + row*N] = rowcol_dot(matrix_a, matrix_b, row, col, N);
}
''')

matrix_ker = ker.get_function('matrix_mult_ker')

test_a = np.float32([range(1,5)] * 4)
test_b = np.float32([range(14,10, -1)] * 4)

output_mat = np.matmul(test_a, test_b)

test_a_gpu = gpuarray.to_gpu(test_a)
test_b_gpu = gpuarray.to_gpu(test_b)
output_mat_gpu = gpuarray.empty_like(test_a_gpu)

matrix_ker(test_a_gpu, test_b_gpu, output_mat_gpu, np.int32(4), block=(2,2,1), grid=(2,2,1))

assert(np.allclose(output_mat_gpu.get(), output_mat))