Skip to content

Commit 1335738

Browse files
committed Oct 13, 2020
adding comments for count kernels
1 parent 145d8c2 commit 1335738

File tree

2 files changed

+20
-9
lines changed

2 files changed

+20
-9
lines changed
 

‎cpp/include/learning_cuda/count/count_kernels.cuh

+14-5
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ void count_kernel(T* arr, int size, int *count, CountIfOp count_if_op) {
1111
int tid = threadIdx.x + blockIdx.x * blockDim.x;
1212

1313
if (tid < size) {
14-
14+
// count lives in global memory and is updated by many threads, hence atomicAdd
1515
if (count_if_op(arr[tid])) {
1616
atomicAdd(count, 1);
1717
}
@@ -32,6 +32,7 @@ void count_kernel(T* arr, int size, int *count, CountIfOp count_if_op) {
3232
__shared__ int local_count_array[TPB];
3333

3434
if (tid < size) {
35+
// record this thread's predicate result in its shared-memory slot
3536
if (count_if_op(arr[tid])) {
3637
local_count_array[threadIdx.x] = 1;
3738
}
@@ -41,6 +42,9 @@ void count_kernel(T* arr, int size, int *count, CountIfOp count_if_op) {
4142

4243
__syncthreads();
4344

45+
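// manual tree reduction within the block: at each step the lower half
46+
// of the active range accumulates the matching slot from the upper half
47+
// note: the active threads are contiguous (threadIdx.x < offset), which keeps warps convergent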
4448
for (int offset = blockDim.x / 2; offset > 0; offset >>=1 ) {
4549
if (threadIdx.x < offset && tid + offset < size) {
4650
local_count_array[threadIdx.x] += local_count_array[threadIdx.x + offset];
@@ -67,6 +71,7 @@ void count_kernel(T* arr, int size, int *count, CountIfOp count_if_op) {
6771

6872
bool predicate = tid < size && count_if_op(arr[tid]);
6973

74+
// block-level primitive: barrier that also returns how many threads in the block had a true predicate
7075
int block_count = __syncthreads_count(predicate);
7176

7277
if (threadIdx.x == 0) {
@@ -90,26 +95,30 @@ void count_kernel(T* arr, int size, int *count, CountIfOp count_if_op) {
9095

9196
bool predicate = tid < size && count_if_op(arr[tid]);
9297

98+
// collect one bit per lane of this warp: set where the lane's predicate is true
99+
// FULL_MASK has all 32 bits set, so every lane of the warp takes part in the vote
93100
unsigned ballot_mask = __ballot_sync(FULL_MASK, predicate);
94-
int warp_count = __popc(ballot_mask);
101+
int warp_count = __popc(ballot_mask); // counts set bits
95102

96-
// global atomics
97-
// if(threadIdx.x == 0) {
103+
// global memory atomics
104+
// if (threadIdx.x % 32 == 0) {
98105
// atomicAdd(count, warp_count);
99106
// }
100107

101-
108+
// shared memory with one slot per warp in the block (TPB / 32)
102109
// optimization for block reduction
103110
__shared__ int block_counts[TPB / 32];
104111

105112
int warp_id = threadIdx.x / 32;
106113
int lane_id = threadIdx.x % 32;
107114
if (lane_id == 0) {
115+
// set local warp count in smem
108116
block_counts[warp_id] = warp_count;
109117
}
110118

111119
__syncthreads();
112120

121+
// same halving reduction as the earlier shared-memory kernel, now over the per-warp counts
113122
for (int offset = (TPB / 32) / 2; offset > 0; offset >>= 1) {
114123
if (lane_id == 0 && warp_id < offset) {
115124
block_counts[warp_id] += block_counts[warp_id + offset];

‎cpp/src/count/count.cu

+6-4
Original file line numberDiff line numberDiff line change
@@ -36,27 +36,29 @@ int main() {
3636

3737
int *rand_d_ptr = thrust::raw_pointer_cast(rand_d.data());
3838

39+
const int TPB = 128;
40+
3941
cudaDeviceSynchronize();
40-
int naive_count = naive::count_if<int, 64>(rand_d_ptr, n_elems,
42+
int naive_count = naive::count_if<int, TPB>(rand_d_ptr, n_elems,
4143
is_greater_than_10);
4244

4345
std::cout << "\nNaive Count: " << naive_count << std::endl;
4446

4547
cudaDeviceSynchronize();
46-
int man_count = manual_reduction::count_if<int, 64>(rand_d_ptr, n_elems,
48+
int man_count = manual_reduction::count_if<int, TPB>(rand_d_ptr, n_elems,
4749
is_greater_than_10);
4850

4951
std::cout << "\nManual Reduction Count: " << man_count << std::endl;
5052

5153
cudaDeviceSynchronize();
52-
int syn_count = syncthreads_count_reduction::count_if<int, 64>(rand_d_ptr,
54+
int syn_count = syncthreads_count_reduction::count_if<int, TPB>(rand_d_ptr,
5355
n_elems,
5456
is_greater_than_10);
5557

5658
std::cout << "\nSyncthread Reduction Count: " << syn_count << std::endl;
5759

5860
cudaDeviceSynchronize();
59-
int bal_count = ballot_sync_reduction::count_if<int, 64>(rand_d_ptr,
61+
int bal_count = ballot_sync_reduction::count_if<int, TPB>(rand_d_ptr,
6062
n_elems,
6163
is_greater_than_10);
6264

0 commit comments

Comments
 (0)