Skip to content

Commit 145d8c2

Browse files
committedOct 12, 2020
shared memory only kernel
1 parent 55e4754 commit 145d8c2

File tree

3 files changed

+90
-17
lines changed

3 files changed

+90
-17
lines changed
 

‎cpp/include/learning_cuda/2d_heat/2d_heat.cuh

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,4 +42,24 @@ void heat_diffusion(T *data_d_old, T *data_d_new, int nx, int ny, int iter) {
4242

4343
}
4444

45-
} // namespace shared_global
45+
} // namespace shared_global
46+
47+
namespace shared_only {
48+
49+
template<typename T, int TPB>
50+
void heat_diffusion(T *data_d_old, T *data_d_new, int nx, int ny, int iter) {
51+
dim3 block_size(TPB, TPB);
52+
dim3 grid_size(std::ceil((float) nx / (TPB - 2)),
53+
std::ceil((float) ny / (TPB - 2)));
54+
for(int i = 0; i < iter; i += 2) {
55+
detail::heat_kernel<T, TPB><<<grid_size, block_size>>> (data_d_new,
56+
data_d_old,
57+
nx, ny);
58+
detail::heat_kernel<T, TPB><<<grid_size, block_size>>> (data_d_old,
59+
data_d_new,
60+
nx, ny);
61+
}
62+
63+
}
64+
65+
} // namespace shared_only

‎cpp/include/learning_cuda/2d_heat/2d_heat_kernels.cuh

Lines changed: 36 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,4 +71,39 @@ void heat_kernel(const T * __restrict__ data_d_old, T * __restrict__ data_d_new,
7171
}
7272

7373
} // namespace detail
74-
} // namespace shared_global
74+
} // namespace shared_global
75+
76+
namespace shared_only {
77+
namespace detail {
78+
79+
template <typename T, int TPB>
80+
__global__
81+
void heat_kernel(const T * __restrict__ data_d_old, T * __restrict__ data_d_new,
82+
int nx, int ny) {
83+
int i = threadIdx.x + (TPB - 2) * blockIdx.x;
84+
int j = threadIdx.y + (TPB - 2) * blockIdx.y;
85+
86+
__shared__ T data_shared[TPB][TPB];
87+
88+
int self_idx = i * ny + j;
89+
90+
if (i < nx && j < ny) {
91+
data_shared[threadIdx.x][threadIdx.y] = data_d_old[self_idx];
92+
}
93+
__syncthreads();
94+
95+
if ((i > 0 && i < nx - 1) && (j > 0 && j < ny - 1)) {
96+
if ((threadIdx.x > 0 && threadIdx.x < TPB - 1) && (threadIdx.y > 0 && threadIdx.y < TPB - 1))
97+
{
98+
// pick from shared memory
99+
data_d_new[self_idx] = 0.25 * (data_shared[threadIdx.x - 1][threadIdx.y] +
100+
data_shared[threadIdx.x + 1][threadIdx.y] +
101+
data_shared[threadIdx.x][threadIdx.y - 1] +
102+
data_shared[threadIdx.x][threadIdx.y + 1]);
103+
}
104+
}
105+
106+
}
107+
108+
} // namespace detail
109+
} // namespace shared_only

‎cpp/src/2d_heat/2d_heat.cu

Lines changed: 33 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -16,45 +16,63 @@ void _initialize_temps(T *data_d, int nx, int ny) {
1616

1717
int main() {
1818

19-
int nx = 10, ny = 3;
19+
int nx = 2000, ny = 2000;
2020
int NBLK = std::ceil((float) nx / 32);
21-
int iter = 2;
21+
int iter = 100;
2222

2323
// naive
24-
thrust::device_vector<double> data_d_old(nx * ny, 0);
24+
thrust::device_vector<float> data_d_old(nx * ny, 0);
2525

2626
_initialize_temps<<<NBLK, 32>>> (thrust::raw_pointer_cast(data_d_old.data()),
2727
nx, ny);
2828

29-
thrust::device_vector<double> data_d_new = data_d_old;
29+
thrust::device_vector<float> data_d_new = data_d_old;
3030

3131
cudaDeviceSynchronize();
32-
naive::heat_diffusion<double, 16>(thrust::raw_pointer_cast(data_d_old.data()),
32+
naive::heat_diffusion<float, 16>(thrust::raw_pointer_cast(data_d_old.data()),
3333
thrust::raw_pointer_cast(data_d_new.data()),
3434
nx, ny, iter);
35-
35+
cudaDeviceSynchronize();
3636
// Printing device vector
37-
std::cout << "\n Naive: \n";
38-
thrust::copy(data_d_new.begin(), data_d_new.end(),
39-
std::ostream_iterator<double>(std::cout, " "));
37+
// std::cout << "\n Naive: \n";
38+
// thrust::copy(data_d_new.begin(), data_d_new.end(),
39+
// std::ostream_iterator<double>(std::cout, " "));
4040

4141
// shared_global
42-
thrust::device_vector<double> data_d_old_sg(nx * ny, 0);
42+
thrust::device_vector<float> data_d_old_sg(nx * ny, 0);
4343

4444
_initialize_temps<<<NBLK, 32>>> (thrust::raw_pointer_cast(data_d_old_sg.data()),
4545
nx, ny);
4646

47-
thrust::device_vector<double> data_d_new_sg = data_d_old_sg;
47+
thrust::device_vector<float> data_d_new_sg = data_d_old_sg;
4848

4949
cudaDeviceSynchronize();
50-
shared_global::heat_diffusion<double, 16>(thrust::raw_pointer_cast(data_d_old_sg.data()),
50+
shared_global::heat_diffusion<float, 16>(thrust::raw_pointer_cast(data_d_old_sg.data()),
5151
thrust::raw_pointer_cast(data_d_new_sg.data()),
5252
nx, ny, iter);
53+
cudaDeviceSynchronize();
54+
// Printing device vector
55+
// std::cout << "\n Shared & Global: \n";
56+
// thrust::copy(data_d_new_sg.begin(), data_d_new_sg.end(),
57+
// std::ostream_iterator<double>(std::cout, " "));
58+
59+
// shared_only
60+
thrust::device_vector<float> data_d_old_so(nx * ny, 0);
5361

62+
_initialize_temps<<<NBLK, 32>>> (thrust::raw_pointer_cast(data_d_old_so.data()),
63+
nx, ny);
64+
65+
thrust::device_vector<float> data_d_new_so = data_d_old_so;
66+
67+
cudaDeviceSynchronize();
68+
shared_only::heat_diffusion<float, 16>(thrust::raw_pointer_cast(data_d_old_so.data()),
69+
thrust::raw_pointer_cast(data_d_new_so.data()),
70+
nx, ny, iter);
71+
cudaDeviceSynchronize();
5472
// Printing device vector
55-
std::cout << "\n Shared & Global: \n";
56-
thrust::copy(data_d_new_sg.begin(), data_d_new_sg.end(),
57-
std::ostream_iterator<double>(std::cout, " "));
73+
// std::cout << "\n Shared Only: \n";
74+
// thrust::copy(data_d_new_so.begin(), data_d_new_so.end(),
75+
// std::ostream_iterator<double>(std::cout, " "));
5876

5977
return 0;
6078
}

0 commit comments

Comments
 (0)