mirror of
https://github.com/PacktPublishing/Hands-On-GPU-Programming-with-CUDA-C-and-Python-3.x-Second-Edition.git
synced 2025-08-15 07:00:06 +02:00
Create naive_prefix.py
Chapter04/naive_prefix.py (new file, 52 lines)
@@ -0,0 +1,52 @@
import pycuda.autoinit
import numpy as np
from pycuda import gpuarray
from pycuda.compiler import SourceModule

# a naive (work-inefficient) parallel prefix-sum kernel that uses shared
# memory; this is a Hillis-Steele inclusive scan over one 1024-thread block
naive_ker = SourceModule("""
__global__ void naive_prefix(double *vec, double *out)
{
    __shared__ double sum_buf[1024];
    int tid = threadIdx.x;
    sum_buf[tid] = vec[tid];

    // begin the parallel prefix-sum algorithm: after step i, each element
    // holds the sum of the 2^(i+1) inputs ending at its own position
    int iter = 1;
    for (int i = 0; i < 10; i++)   // 10 doubling steps, since 2^10 = 1024
    {
        __syncthreads();
        // read the partner value into a register before the second barrier,
        // so this thread's write cannot race with another thread's read
        double partner = (tid >= iter) ? sum_buf[tid - iter] : 0.0;
        __syncthreads();
        if (tid >= iter)
            sum_buf[tid] += partner;
        iter *= 2;
    }

    __syncthreads();
    out[tid] = sum_buf[tid];
}
""")
naive_gpu = naive_ker.get_function("naive_prefix")


if __name__ == '__main__':

    testvec = np.random.randn(1024).astype(np.float64)
    testvec_gpu = gpuarray.to_gpu(testvec)

    outvec_gpu = gpuarray.empty_like(testvec_gpu)

    # launch one block of 1024 threads: one thread per input element
    naive_gpu(testvec_gpu, outvec_gpu, block=(1024, 1, 1), grid=(1, 1, 1))

    # the last element of an inclusive prefix sum is the total of the input
    total_sum = np.sum(testvec)
    total_sum_gpu = outvec_gpu[-1].get()

    print("Does our kernel work correctly? : {}".format(np.allclose(total_sum_gpu, total_sum)))