Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit 14f2e8a

Browse files
committedSep 27, 2020
all working kernels, may be some spilling in ballot count edge case
1 parent 70569d3 commit 14f2e8a

File tree

2 files changed

+44
-13
lines changed

2 files changed

+44
-13
lines changed
 

‎cpp/include/learning_cuda/count/count_kernels.cuh

Lines changed: 32 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ __global__
2929
void count_kernel(T* arr, int size, int *count, CountIfOp count_if_op) {
3030
int tid = threadIdx.x + blockIdx.x * blockDim.x;
3131

32-
__shared__ unsigned int local_count_array[TPB];
32+
__shared__ int local_count_array[TPB];
3333

3434
if (tid < size) {
3535
if (count_if_op(arr[tid])) {
@@ -44,11 +44,11 @@ void count_kernel(T* arr, int size, int *count, CountIfOp count_if_op) {
4444
for (int offset = blockDim.x / 2; offset > 0; offset >>=1 ) {
4545
if (threadIdx.x < offset && tid + offset < size) {
4646
local_count_array[threadIdx.x] += local_count_array[threadIdx.x + offset];
47-
__syncthreads();
4847
}
48+
__syncthreads();
4949
}
5050

51-
if(threadIdx.x == 0) {
51+
if (threadIdx.x == 0) {
5252
atomicAdd(count, local_count_array[threadIdx.x]);
5353
}
5454
}
@@ -69,7 +69,7 @@ void count_kernel(T* arr, int size, int *count, CountIfOp count_if_op) {
6969

7070
int block_count = __syncthreads_count(predicate);
7171

72-
if(threadIdx.x == 0) {
72+
if (threadIdx.x == 0) {
7373
atomicAdd(count, block_count);
7474
}
7575

@@ -93,8 +93,34 @@ void count_kernel(T* arr, int size, int *count, CountIfOp count_if_op) {
9393
unsigned ballot_mask = __ballot_sync(FULL_MASK, predicate);
9494
int warp_count = __popc(ballot_mask);
9595

96-
if(threadIdx.x % 32 == 0) {
97-
atomicAdd(count, warp_count);
96+
// global atomics
97+
// if(threadIdx.x == 0) {
98+
// atomicAdd(count, warp_count);
99+
// }
100+
101+
102+
// optimization for block reduction
103+
__shared__ int block_counts[TPB / 32];
104+
105+
if (tid < size) {
106+
int warp_id = threadIdx.x / 32;
107+
int lane_id = threadIdx.x % 32;
108+
if (lane_id == 0) {
109+
block_counts[warp_id] = warp_count;
110+
}
111+
112+
__syncthreads();
113+
114+
for (int offset = (TPB / 32) / 2; offset > 0; offset >>= 1) {
115+
if (lane_id == 0 && warp_id < offset && tid + offset < size) {
116+
block_counts[warp_id] += block_counts[warp_id + offset];
117+
}
118+
__syncthreads();
119+
}
120+
121+
if(threadIdx.x == 0) {
122+
atomicAdd(count, block_counts[threadIdx.x]);
123+
}
98124
}
99125

100126
}

‎cpp/src/count/count.cu

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212

1313

1414
int main() {
15-
int n_elems = 100;
15+
int n_elems = 10000000;
1616

1717
thrust::device_vector<int> rand_d(n_elems, 0);
1818

@@ -28,30 +28,35 @@ int main() {
2828
return (x > 10 ? true : false);
2929
};
3030

31+
cudaDeviceSynchronize();
3132
int thrust_count = thrust::count_if(thrust::device, rand_d.begin(),
3233
rand_d.end(), is_greater_than_10);
3334

3435
std::cout << "\nThrust Count: " << thrust_count << std::endl;
3536

3637
int *rand_d_ptr = thrust::raw_pointer_cast(rand_d.data());
3738

38-
int naive_count = naive::count_if<int, 32>(rand_d_ptr, n_elems,
39-
is_greater_than_10);
39+
cudaDeviceSynchronize();
40+
int naive_count = naive::count_if<int, 64>(rand_d_ptr, n_elems,
41+
is_greater_than_10);
4042

4143
std::cout << "\nNaive Count: " << naive_count << std::endl;
4244

43-
int man_count = manual_reduction::count_if<int, 32>(rand_d_ptr, n_elems,
44-
is_greater_than_10);
45+
cudaDeviceSynchronize();
46+
int man_count = manual_reduction::count_if<int, 64>(rand_d_ptr, n_elems,
47+
is_greater_than_10);
4548

4649
std::cout << "\nManual Reduction Count: " << man_count << std::endl;
4750

48-
int syn_count = syncthreads_count_reduction::count_if<int, 32>(rand_d_ptr,
51+
cudaDeviceSynchronize();
52+
int syn_count = syncthreads_count_reduction::count_if<int, 64>(rand_d_ptr,
4953
n_elems,
5054
is_greater_than_10);
5155

5256
std::cout << "\nSyncthread Reduction Count: " << syn_count << std::endl;
5357

54-
int bal_count = ballot_sync_reduction::count_if<int, 32>(rand_d_ptr,
58+
cudaDeviceSynchronize();
59+
int bal_count = ballot_sync_reduction::count_if<int, 64>(rand_d_ptr,
5560
n_elems,
5661
is_greater_than_10);
5762

0 commit comments

Comments
 (0)
Please sign in to comment.