shared memory only kernel

divyegala · divyegala · commit 145d8c2d4962 · 2020-10-12T18:30:38.000-05:00
diff --git a/cpp/include/learning_cuda/2d_heat/2d_heat.cuh b/cpp/include/learning_cuda/2d_heat/2d_heat.cuh
@@ -42,4 +42,49 @@ void heat_diffusion(T *data_d_old, T *data_d_new, int nx, int ny, int iter) {
 
 }
 
-} // namespace shared_global
+} // namespace shared_global
+
+namespace shared_only {
+
+// Host-side driver for the shared-memory-only heat kernel.
+//
+// Each TPB x TPB block loads one tile, but only its inner (TPB - 2) x (TPB - 2)
+// threads write a result, so blocks are spaced TPB - 2 cells apart on both axes.
+//
+// Kernels are launched in pairs that swap the two buffers: the first launch
+// reads data_d_new and writes data_d_old, the second reads data_d_old and
+// writes data_d_new. The final state therefore ends up in data_d_new, and an
+// odd `iter` performs iter + 1 steps.
+//
+//   T           element type of the grid
+//   TPB         threads per block along each axis; must be greater than 2
+//   data_d_old  device buffer, indexed as [i * ny + j] by the kernel
+//   data_d_new  device buffer with the same layout; receives the result
+//   nx, ny      grid extents along i and j
+//   iter        number of steps requested
+template<typename T, int TPB>
+void heat_diffusion(T *data_d_old, T *data_d_new, int nx, int ny, int iter) {
+    static_assert(TPB > 2, "TPB must be greater than 2: each block needs an interior cell");
+
+    // Cells covered per block along one axis once the one-cell border ring is excluded.
+    constexpr int stride = TPB - 2;
+
+    dim3 block_size(TPB, TPB);
+    // Integer ceiling division: exact for every positive int extent, whereas
+    // rounding through float loses precision once the extent exceeds 2^24.
+    dim3 grid_size((nx + stride - 1) / stride,
+                   (ny + stride - 1) / stride);
+    for(int i = 0; i < iter; i += 2) {
+        // First half of the pair: data_d_new is read, data_d_old is written.
+        detail::heat_kernel<T, TPB><<<grid_size, block_size>>> (data_d_new,
+                                                                data_d_old,
+                                                                nx, ny);
+        // Second half: roles swap, leaving the newest state in data_d_new.
+        detail::heat_kernel<T, TPB><<<grid_size, block_size>>> (data_d_old,
+                                                                data_d_new,
+                                                                nx, ny);
+    }
+
+}
+
+} // namespace shared_only
diff --git a/cpp/include/learning_cuda/2d_heat/2d_heat_kernels.cuh b/cpp/include/learning_cuda/2d_heat/2d_heat_kernels.cuh
@@ -71,4 +71,57 @@ void heat_kernel(const T * __restrict__ data_d_old, T * __restrict__ data_d_new,
 }
 
 } // namespace detail
-} // namespace shared_global
+} // namespace shared_global
+
+namespace shared_only {
+namespace detail {
+
+// Performs one diffusion step using only shared memory for neighbour reads.
+//
+// Every thread copies its own cell of data_d_old into a TPB x TPB shared tile.
+// Blocks are spaced TPB - 2 cells apart, so adjacent tiles overlap and the
+// outermost ring of each tile only supplies neighbour values. Threads that are
+// strictly inside both the tile and the grid write the scaled sum
+// 0.25 * (up + down + left + right) to data_d_new; all other cells of
+// data_d_new are left untouched.
+//
+// Expects a TPB x TPB thread block; cell (i, j) lives at index i * ny + j.
+template <typename T, int TPB>
+__global__
+void heat_kernel(const T * __restrict__ data_d_old, T * __restrict__ data_d_new,
+                    int nx, int ny) {
+    const int tx = threadIdx.x;
+    const int ty = threadIdx.y;
+
+    // Global cell handled by this thread.
+    const int i = tx + (TPB - 2) * blockIdx.x;
+    const int j = ty + (TPB - 2) * blockIdx.y;
+    const int self_idx = i * ny + j;
+
+    __shared__ T data_shared[TPB][TPB];
+
+    // Stage the cell in shared memory; threads beyond the grid load nothing.
+    if (i < nx && j < ny) {
+        data_shared[tx][ty] = data_d_old[self_idx];
+    }
+    // The barrier stays outside the branch so every thread of the block reaches it.
+    __syncthreads();
+
+    // Grid borders are never written, and the tile's outer ring lacks a full
+    // set of neighbours in shared memory, so both are skipped.
+    const bool inside_grid = (i > 0 && i < nx - 1) && (j > 0 && j < ny - 1);
+    const bool inside_tile = (tx > 0 && tx < TPB - 1) && (ty > 0 && ty < TPB - 1);
+
+    if (inside_grid && inside_tile) {
+        // The factor is cast to T: a bare 0.25 is a double literal and would
+        // promote the whole product to double when T is float.
+        data_d_new[self_idx] = static_cast<T>(0.25) * (data_shared[tx - 1][ty] +
+                                                       data_shared[tx + 1][ty] +
+                                                       data_shared[tx][ty - 1] +
+                                                       data_shared[tx][ty + 1]);
+    }
+
+}
+
+} // namespace detail
+} // namespace shared_only
diff --git a/cpp/src/2d_heat/2d_heat.cu b/cpp/src/2d_heat/2d_heat.cu
@@ -16,45 +16,63 @@ void _initialize_temps(T *data_d, int nx, int ny) {
 
 int main() {
 
-    int nx = 10, ny = 3;
+    int nx = 2000, ny = 2000; // grid extents shared by all three variants below
     int NBLK = std::ceil((float) nx / 32);
-    int iter = 2;
+    int iter = 100; // step count passed to every heat_diffusion call
 
     // naive
-    thrust::device_vector<double> data_d_old(nx * ny, 0);
+    thrust::device_vector<float> data_d_old(nx * ny, 0);
 
     _initialize_temps<<<NBLK, 32>>> (thrust::raw_pointer_cast(data_d_old.data()),
                                      nx, ny);
 
-    thrust::device_vector<double> data_d_new = data_d_old;
+    thrust::device_vector<float> data_d_new = data_d_old;
 
     cudaDeviceSynchronize();
-    naive::heat_diffusion<double, 16>(thrust::raw_pointer_cast(data_d_old.data()),
+    naive::heat_diffusion<float, 16>(thrust::raw_pointer_cast(data_d_old.data()),
                                       thrust::raw_pointer_cast(data_d_new.data()),
                                       nx, ny, iter);
-
+    cudaDeviceSynchronize(); // block until the naive run has finished on the device
     // Printing device vector
-    std::cout << "\n Naive: \n";
-    thrust::copy(data_d_new.begin(), data_d_new.end(),
-                 std::ostream_iterator<double>(std::cout, " "));
+    // std::cout << "\n Naive: \n";
+    // thrust::copy(data_d_new.begin(), data_d_new.end(),
+    //              std::ostream_iterator<float>(std::cout, " "));
 
     // shared_global
-    thrust::device_vector<double> data_d_old_sg(nx * ny, 0);
+    thrust::device_vector<float> data_d_old_sg(nx * ny, 0);
 
     _initialize_temps<<<NBLK, 32>>> (thrust::raw_pointer_cast(data_d_old_sg.data()),
                                      nx, ny);
 
-    thrust::device_vector<double> data_d_new_sg = data_d_old_sg;
+    thrust::device_vector<float> data_d_new_sg = data_d_old_sg;
 
     cudaDeviceSynchronize();
-    shared_global::heat_diffusion<double, 16>(thrust::raw_pointer_cast(data_d_old_sg.data()),
+    shared_global::heat_diffusion<float, 16>(thrust::raw_pointer_cast(data_d_old_sg.data()),
                                               thrust::raw_pointer_cast(data_d_new_sg.data()),
                                               nx, ny, iter);
+    cudaDeviceSynchronize(); // block until the shared_global run has finished on the device
+    // Printing device vector
+    // std::cout << "\n Shared & Global: \n";
+    // thrust::copy(data_d_new_sg.begin(), data_d_new_sg.end(),
+    //              std::ostream_iterator<float>(std::cout, " "));
+
+    // shared_only: same setup as the two variants above, run through shared_only::heat_diffusion
+    thrust::device_vector<float> data_d_old_so(nx * ny, 0);
 
+    _initialize_temps<<<NBLK, 32>>> (thrust::raw_pointer_cast(data_d_old_so.data()),
+                                     nx, ny);
+
+    thrust::device_vector<float> data_d_new_so = data_d_old_so;
+
+    cudaDeviceSynchronize(); // wait for the initialization work queued above
+    shared_only::heat_diffusion<float, 16>(thrust::raw_pointer_cast(data_d_old_so.data()),
+                                              thrust::raw_pointer_cast(data_d_new_so.data()),
+                                              nx, ny, iter);
+    cudaDeviceSynchronize(); // block until the shared_only run has finished on the device
     // Printing device vector
-    std::cout << "\n Shared & Global: \n";
-    thrust::copy(data_d_new_sg.begin(), data_d_new_sg.end(),
-                 std::ostream_iterator<double>(std::cout, " "));
+    // std::cout << "\n Shared Only: \n";
+    // thrust::copy(data_d_new_so.begin(), data_d_new_so.end(),
+    //              std::ostream_iterator<float>(std::cout, " "));
 
     return 0;
 }