Mirror of https://github.com/PacktPublishing/Hands-On-GPU-Programming-with-CUDA-C-and-Python-3.x-Second-Edition.git (synced 2025-07-21 04:41:05 +02:00)
added chapter06 (everything thus far works on Python 3, btw...)
Chapter06/broken_matrix_ker.py (new file, 79 lines added)
@@ -0,0 +1,79 @@
# Note: this code is intentionally broken!!!
# (This is intended to show a case study of how to debug CUDA code
# using printf.)

import pycuda.autoinit
import pycuda.driver as drv
from pycuda import gpuarray
from pycuda.compiler import SourceModule
import numpy as np

ker = SourceModule('''
// row-column dot-product for matrix multiplication
__device__ float rowcol_dot(float *matrix_a, float *matrix_b, int row, int col, int N)
{
    //printf("threadIdx.x,y: %d,%d blockIdx.x,y: %d,%d -- row is %d, col is %d, N is %d.\\n", threadIdx.x, threadIdx.y, blockIdx.x, blockIdx.y, row, col, N);
    float val = 0;

    for (int k=0; k < N; k++)
    {
        // broken version
        val += matrix_a[ row + k*N ] * matrix_b[ col*N + k];

        //if(threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0 && blockIdx.y == 0)
        //    printf("Dot-product loop: k value is %d, matrix_a value is %f, matrix_b is %f.\\n", k, matrix_a[ row + k*N ], matrix_b[ col*N + k]);

        // fixed version
        //val += matrix_a[ row*N + k ] * matrix_b[ col + k*N];
    }

    return(val);
}

// matrix multiplication kernel that is parallelized over row/column tuples.
__global__ void matrix_mult_ker(float * matrix_a, float * matrix_b, float * output_matrix, int N)
{
    // broken version
    int row = blockIdx.x + threadIdx.x;
    int col = blockIdx.y + threadIdx.y;

    // fixed version
    //int row = blockIdx.x*blockDim.x + threadIdx.x;
    //int col = blockIdx.y*blockDim.y + threadIdx.y;

    //printf("threadIdx.x,y: %d,%d blockIdx.x,y: %d,%d -- row is %d, col is %d.\\n", threadIdx.x, threadIdx.y, blockIdx.x, blockIdx.y, row, col);

    // broken version
    output_matrix[col + row*N] = rowcol_dot(matrix_a, matrix_b, col, row, N);

    // fixed version
    //output_matrix[col + row*N] = rowcol_dot(matrix_a, matrix_b, row, col, N);
}
''')

matrix_ker = ker.get_function('matrix_mult_ker')

test_a = np.float32([range(1, 5)] * 4)
test_b = np.float32([range(14, 10, -1)] * 4)

output_mat = np.matmul(test_a, test_b)

test_a_gpu = gpuarray.to_gpu(test_a)
test_b_gpu = gpuarray.to_gpu(test_b)
output_mat_gpu = gpuarray.empty_like(test_a_gpu)

matrix_ker(test_a_gpu, test_b_gpu, output_mat_gpu, np.int32(4), block=(2,2,1), grid=(2,2,1))

assert np.allclose(output_mat_gpu.get(), output_mat)
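Run as-is, the script ends with the final assert failing; uncommenting the printf calls inside the kernel is the debugging exercise the chapter walks through. As a host-side illustration of the dot-product bug (a minimal NumPy sketch, not part of the repository): matrix_a[row + k*N] reads A transposed and matrix_b[col*N + k] reads B transposed, so the loop accumulates entries of A.T @ B.T rather than A @ B.

import numpy as np

A = np.float32([range(1, 5)] * 4)
B = np.float32([range(14, 10, -1)] * 4)

# The broken loop computes (A.T @ B.T)[row][col]; the launch-index bug in
# matrix_mult_ker then makes distinct threads collide on top of that.
print(np.allclose(A.T @ B.T, A @ B))   # prints False for these test matrices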
Chapter06/divergence_test.cu (new file, 18 lines added)
@@ -0,0 +1,18 @@
#include <cuda_runtime.h>
#include <stdio.h>

__global__ void divergence_test_ker()
{
    if( threadIdx.x % 2 == 0)
        printf("threadIdx.x %d : This is an even thread.\n", threadIdx.x);
    else
        printf("threadIdx.x %d : This is an odd thread.\n", threadIdx.x);
}

__host__ int main()
{
    cudaSetDevice(0);
    divergence_test_ker<<<1, 32>>>();
    cudaDeviceSynchronize();
    cudaDeviceReset();
}
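This one is plain CUDA C; it can be compiled with the CUDA toolkit, e.g. nvcc divergence_test.cu -o divergence_test. The launch is a single block of 32 threads, exactly one warp, so the even/odd branch forces warp divergence. For comparison with the chapter's Python scripts, a minimal PyCUDA sketch of the same demo (assuming a working PyCUDA install; the kernel body is unchanged):

import pycuda.autoinit
from pycuda.compiler import SourceModule

ker = SourceModule('''
__global__ void divergence_test_ker()
{
    if( threadIdx.x % 2 == 0)
        printf("threadIdx.x %d : This is an even thread.\\n", threadIdx.x);
    else
        printf("threadIdx.x %d : This is an odd thread.\\n", threadIdx.x);
}
''')

# one block of 32 threads: a single warp, so the branch diverges within it
ker.get_function('divergence_test_ker')(block=(32,1,1), grid=(1,1,1))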
Chapter06/hello-world_gpu.py (new file, 23 lines added)
@@ -0,0 +1,23 @@
import pycuda.autoinit
import pycuda.driver as drv
from pycuda import gpuarray
from pycuda.compiler import SourceModule

ker = SourceModule('''
__global__ void hello_world_ker()
{
    printf("Hello world from thread %d, in block %d!\\n", threadIdx.x, blockIdx.x);

    __syncthreads();

    if(threadIdx.x == 0 && blockIdx.x == 0)
    {
        printf("-------------------------------------\\n");
        printf("This kernel was launched over a grid consisting of %d blocks,\\n", gridDim.x);
        printf("where each block has %d threads.\\n", blockDim.x);
    }
}
''')

hello_ker = ker.get_function("hello_world_ker")
hello_ker( block=(5,1,1), grid=(2,1,1) )
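gridDim.x and blockDim.x are read at kernel run time, so the same compiled kernel reports different counts when relaunched with other parameters; for instance (a usage sketch reusing hello_ker from above):

hello_ker( block=(4,1,1), grid=(3,1,1) )   # now reports 3 blocks of 4 threads each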
Chapter06/matrix_ker.cu (new file, 143 lines added)
@@ -0,0 +1,143 @@
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>
#define _EPSILON 0.001
// absolute value macro (argument parenthesized so it is safe on expressions like A[i] - B[i])
#define _ABS(x) ( (x) > 0.0f ? (x) : -(x) )

__host__ int allclose(float *A, float *B, int len)
{
    int returnval = 0;

    for (int i = 0; i < len; i++)
    {
        if ( _ABS(A[i] - B[i]) > _EPSILON )
        {
            returnval = -1;
            break;
        }
    }

    return(returnval);
}

// row-column dot-product for matrix multiplication
__device__ float rowcol_dot(float *matrix_a, float *matrix_b, int row, int col, int N)
{
    float val = 0;

    for (int k=0; k < N; k++)
    {
        val += matrix_a[ row*N + k ] * matrix_b[ col + k*N];
    }

    return(val);
}

// matrix multiplication kernel that is parallelized over row/column tuples.
__global__ void matrix_mult_ker(float * matrix_a, float * matrix_b, float * output_matrix, int N)
{
    int row = blockIdx.x*blockDim.x + threadIdx.x;
    int col = blockIdx.y*blockDim.y + threadIdx.y;

    output_matrix[col + row*N] = rowcol_dot(matrix_a, matrix_b, row, col, N);
}

__host__ int main()
{
    // Initialize to use first GPU.
    cudaSetDevice(0);

    // this indicates the width/height of the matrices
    int N = 4;

    // this will indicate how many bytes to allocate to store a test or output matrix
    int num_bytes = sizeof(float)*N*N;

    // input test matrix A
    float h_A[] = { 1.0, 2.0, 3.0, 4.0, \
                    1.0, 2.0, 3.0, 4.0, \
                    1.0, 2.0, 3.0, 4.0, \
                    1.0, 2.0, 3.0, 4.0 };

    // input test matrix B
    float h_B[] = { 14.0, 13.0, 12.0, 11.0, \
                    14.0, 13.0, 12.0, 11.0, \
                    14.0, 13.0, 12.0, 11.0, \
                    14.0, 13.0, 12.0, 11.0 };

    // expected output of A times B
    float h_AxB[] = { 140.0, 130.0, 120.0, 110.0, \
                      140.0, 130.0, 120.0, 110.0, \
                      140.0, 130.0, 120.0, 110.0, \
                      140.0, 130.0, 120.0, 110.0 };

    // these pointers will be used for the GPU.
    // (notice how we use normal float pointers)
    float * d_A;
    float * d_B;
    float * d_output;

    // allocate memory for the test matrices on the GPU
    cudaMalloc((float **) &d_A, num_bytes);
    cudaMalloc((float **) &d_B, num_bytes);

    // copy the test matrices to the GPU
    cudaMemcpy(d_A, h_A, num_bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, num_bytes, cudaMemcpyHostToDevice);

    // allocate memory for output on GPU
    cudaMalloc((float **) &d_output, num_bytes);

    // this will store the output from the GPU
    float * h_output;
    h_output = (float *) malloc(num_bytes);

    // setup our block and grid launch parameters with the dim3 class.
    dim3 block(2,2,1);
    dim3 grid(2,2,1);

    // launch our kernel
    matrix_mult_ker <<< grid, block >>> (d_A, d_B, d_output, N);

    // synchronize on the host, to ensure our kernel has finished executing.
    cudaDeviceSynchronize();

    // copy output from device to host.
    cudaMemcpy(h_output, d_output, num_bytes, cudaMemcpyDeviceToHost);

    // synchronize again.
    cudaDeviceSynchronize();

    // free arrays on device.
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_output);

    // reset the GPU.
    cudaDeviceReset();

    // Check to see if we got the expected output.
    // In both cases, remember to de-allocate h_output before returning.
    if (allclose(h_AxB, h_output, N*N) < 0)
    {
        printf("Error! Output of kernel does not match expected output.\n");
        free(h_output);
        return(-1);
    }
    else
    {
        printf("Success! Output of kernel matches expected output.\n");
        free(h_output);
        return(0);
    }
}
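The hard-coded expected matrix h_AxB is easy to cross-check on the host; a quick NumPy sketch (every row of A sums to 10 and the columns of B are constant, so each output row is 10 times B's first row):

import numpy as np
A = np.float32([range(1, 5)] * 4)
B = np.float32([range(14, 10, -1)] * 4)
print(A @ B)   # every row reads: 140. 130. 120. 110.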
Chapter06/matrix_ker.py (new file, 57 lines added)
@@ -0,0 +1,57 @@
# This program is the "fixed" version of broken_matrix_ker.py

# This is to be used for an exercise where this code is translated to
# a pure CUDA-C version.

import pycuda.autoinit
import pycuda.driver as drv
from pycuda import gpuarray
from pycuda.compiler import SourceModule
import numpy as np

ker = SourceModule('''
// row-column dot-product for matrix multiplication
__device__ float rowcol_dot(float *matrix_a, float *matrix_b, int row, int col, int N)
{
    float val = 0;

    for (int k=0; k < N; k++)
    {
        val += matrix_a[ row*N + k ] * matrix_b[ col + k*N];
    }

    return(val);
}

// matrix multiplication kernel that is parallelized over row/column tuples.
__global__ void matrix_mult_ker(float * matrix_a, float * matrix_b, float * output_matrix, int N)
{
    int row = blockIdx.x*blockDim.x + threadIdx.x;
    int col = blockIdx.y*blockDim.y + threadIdx.y;

    output_matrix[col + row*N] = rowcol_dot(matrix_a, matrix_b, row, col, N);
}
''')

matrix_ker = ker.get_function('matrix_mult_ker')

test_a = np.float32([range(1, 5)] * 4)
test_b = np.float32([range(14, 10, -1)] * 4)

output_mat = np.matmul(test_a, test_b)

test_a_gpu = gpuarray.to_gpu(test_a)
test_b_gpu = gpuarray.to_gpu(test_b)
output_mat_gpu = gpuarray.empty_like(test_a_gpu)

matrix_ker(test_a_gpu, test_b_gpu, output_mat_gpu, np.int32(4), block=(2,2,1), grid=(2,2,1))

assert np.allclose(output_mat_gpu.get(), output_mat)
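One detail worth pinning down before doing the CUDA-C translation exercise: all of these kernels flatten a row-major N x N matrix, so element (row, col) sits at flat index row*N + col. A quick sanity check (an illustrative sketch):

import numpy as np
M = np.arange(16, dtype=np.float32).reshape(4, 4)   # row-major, as the kernels assume
row, col, N = 2, 3, 4
assert M[row, col] == M.ravel()[row*N + col]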