PSCF v1.1
ParallelReductions.cu
#ifndef PSPG_PARALLEL_REDUCTIONS_CU
#define PSPG_PARALLEL_REDUCTIONS_CU

#include "ParallelReductions.h"

namespace Pscf {
namespace Pspg {
__global__ void reductionSum(cudaReal* sum, const cudaReal* in, int size)
{
   // Each block reduces 2*blockDim.x input elements, so the grid is half
   // the size of a one-element-per-thread launch and no threads are idle
   // during the first load.
   int tid = threadIdx.x;
   int bid = blockIdx.x;
   int bdim = blockDim.x;
   int idx = bid * (bdim*2) + tid;

   // Shared memory workspace for this block's partial reduction
   extern __shared__ cudaReal sdata[];

   // Load from global memory and perform the first pairwise sum
   if (idx < size) {
      cudaReal in0 = in[idx];
      if (idx + bdim < size) {
         cudaReal in1 = in[idx + bdim];
         sdata[tid] = in0 + in1;
      } else {
         sdata[tid] = in0;
      }
   }

   // Wait for all loads to complete. Every thread in the block must reach
   // this barrier, so it is kept outside the idx < size branch.
   __syncthreads();

   // Tree reduction in shared memory: on each pass, each active thread adds
   // the element one stride away into its own, the block synchronizes, and
   // the stride is halved. blockDim.x must be a power of two.
   for (int stride = bdim/2; stride > 0; stride /= 2) {
      if (tid < stride && idx + stride < size) {
         sdata[tid] += sdata[tid + stride];
      }
      __syncthreads();
   }

   // Thread 0 stores this block's partial sum in global memory
   if (tid == 0 && idx < size) {
      sum[bid] = sdata[0];
   }
}

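/*
* Illustrative host-side sketch (an editorial example, not part of the PSCF
* sources): one plausible way to drive reductionSum so that a device array of
* length n (n >= 1) collapses to a single value. The helper name gpuSumSketch,
* the block size of 256, and the ping-pong buffer handling are assumptions
* made here for illustration; error checking is omitted. Each launch uses
* nBlocks = ceil(n / (2*nThreads)) blocks, because every block consumes
* 2*blockDim.x elements, and passes nThreads*sizeof(cudaReal) bytes of dynamic
* shared memory for sdata[]. The kernel is applied repeatedly to its own
* block-level partial sums until one value remains.
*/
inline cudaReal gpuSumSketch(const cudaReal* d_in, int n)
{
   const int nThreads = 256;   // block size; must be a power of two

   // Two ping-pong buffers for block-level partial sums
   int maxBlocks = (n + 2*nThreads - 1) / (2*nThreads);
   cudaReal* d_partial[2];
   cudaMalloc((void**)&d_partial[0], sizeof(cudaReal) * maxBlocks);
   cudaMalloc((void**)&d_partial[1], sizeof(cudaReal) * maxBlocks);

   // The first pass reads the input array directly
   const cudaReal* d_src = d_in;
   int cur = 0;
   while (n > 1) {
      int nBlocks = (n + 2*nThreads - 1) / (2*nThreads);
      reductionSum<<<nBlocks, nThreads, nThreads*sizeof(cudaReal)>>>(d_partial[cur], d_src, n);
      // Partial sums from this pass become the input of the next pass
      d_src = d_partial[cur];
      cur = 1 - cur;
      n = nBlocks;
   }

   // Copy the single remaining value back to the host
   cudaReal result;
   cudaMemcpy(&result, d_src, sizeof(cudaReal), cudaMemcpyDeviceToHost);
   cudaFree(d_partial[0]);
   cudaFree(d_partial[1]);
   return result;
}
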
__global__ void reductionInnerProduct(cudaReal* innerprod, const cudaReal* a, const cudaReal* b, int size)
{
   // Each block reduces 2*blockDim.x products, so the grid is half the size
   // of a one-element-per-thread launch and no threads are idle during the
   // first load.
   int tid = threadIdx.x;
   int bid = blockIdx.x;
   int bdim = blockDim.x;
   int idx = bid * (bdim*2) + tid;

   // Shared memory workspace for this block's partial reduction
   extern __shared__ cudaReal sdata[];

   // Load from global memory, multiply, and perform the first pairwise sum
   if (idx < size) {
      cudaReal in0 = a[idx]*b[idx];
      if (idx + bdim < size) {
         cudaReal in1 = a[idx + bdim]*b[idx + bdim];
         sdata[tid] = in0 + in1;
      } else {
         sdata[tid] = in0;
      }
   }

   // Wait for all loads to complete. Every thread in the block must reach
   // this barrier, so it is kept outside the idx < size branch.
   __syncthreads();

   // Tree reduction in shared memory: on each pass, each active thread adds
   // the element one stride away into its own, the block synchronizes, and
   // the stride is halved. blockDim.x must be a power of two.
   for (int stride = bdim/2; stride > 0; stride /= 2) {
      if (tid < stride && idx + stride < size) {
         sdata[tid] += sdata[tid + stride];
      }
      __syncthreads();
   }

   // Thread 0 stores this block's partial inner product in global memory
   if (tid == 0 && idx < size) {
      innerprod[bid] = sdata[0];
   }
}

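/*
* Illustrative usage sketch (an editorial example, not part of the PSCF
* sources): computing a full dot product with the kernels in this file,
* assuming d_a and d_b are device arrays of length n. The first launch uses
* reductionInnerProduct to form one partial result per block; because the
* second stage combines partial sums rather than products, the remaining
* reduction must be finished with an ordinary sum, done on the host here for
* brevity. The helper name, block size, and buffer names are illustrative
* choices; error checking is omitted.
*/
inline cudaReal gpuInnerProductSketch(const cudaReal* d_a, const cudaReal* d_b, int n)
{
   const int nThreads = 256;   // block size; must be a power of two
   int nBlocks = (n + 2*nThreads - 1) / (2*nThreads);

   // One partial inner product per block
   cudaReal* d_partial = nullptr;
   cudaMalloc((void**)&d_partial, sizeof(cudaReal) * nBlocks);
   reductionInnerProduct<<<nBlocks, nThreads, nThreads*sizeof(cudaReal)>>>(d_partial, d_a, d_b, n);

   // Sum the block partials on the host; a second reductionSum pass could
   // be used instead when nBlocks is large.
   cudaReal* partial = new cudaReal[nBlocks];
   cudaMemcpy(partial, d_partial, sizeof(cudaReal) * nBlocks, cudaMemcpyDeviceToHost);
   cudaReal result = 0.0;
   for (int i = 0; i < nBlocks; ++i) {
      result += partial[i];
   }
   delete [] partial;
   cudaFree(d_partial);
   return result;
}
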
__global__ void reductionMax(cudaReal* max, const cudaReal* in, int size)
{
   // Each block reduces 2*blockDim.x input elements, so the grid is half
   // the size of a one-element-per-thread launch and no threads are idle
   // during the first load.
   int tid = threadIdx.x;
   int bid = blockIdx.x;
   int bdim = blockDim.x;
   int idx = bid * (bdim*2) + tid;

   // Shared memory workspace for this block's partial reduction
   extern __shared__ cudaReal sdata[];

   // Load from global memory and perform the first pairwise comparison
   if (idx < size) {
      cudaReal in0 = in[idx];
      if (idx + bdim < size) {
         cudaReal in1 = in[idx + bdim];
         sdata[tid] = (in0 > in1) ? in0 : in1;
      } else {
         sdata[tid] = in0;
      }
   }

   // Wait for all loads to complete. Every thread in the block must reach
   // this barrier, so it is kept outside the idx < size branch.
   __syncthreads();

   // Tree reduction in shared memory: on each pass, each active thread keeps
   // the larger of its element and the one a stride away, the block
   // synchronizes, and the stride is halved. blockDim.x must be a power of two.
   for (int stride = bdim/2; stride > 0; stride /= 2) {
      if (tid < stride && idx + stride < size) {
         if (sdata[tid + stride] > sdata[tid]) {
            sdata[tid] = sdata[tid + stride];
         }
      }
      __syncthreads();
   }

   // Thread 0 stores this block's partial maximum in global memory
   if (tid == 0 && idx < size) {
      max[bid] = sdata[0];
   }
}

__global__ void reductionMaxAbs(cudaReal* max, const cudaReal* in, int size)
{
   // Each block reduces 2*blockDim.x input elements, so the grid is half
   // the size of a one-element-per-thread launch and no threads are idle
   // during the first load.
   int tid = threadIdx.x;
   int bid = blockIdx.x;
   int bdim = blockDim.x;
   int idx = bid * (bdim*2) + tid;

   // Shared memory workspace for this block's partial reduction
   extern __shared__ cudaReal sdata[];

   // Load from global memory, take absolute values, and perform the first
   // pairwise comparison
   if (idx < size) {
      cudaReal in0 = fabs(in[idx]);
      if (idx + bdim < size) {
         cudaReal in1 = fabs(in[idx + bdim]);
         sdata[tid] = (in0 > in1) ? in0 : in1;
      } else {
         sdata[tid] = in0;
      }
   }

   // Wait for all loads to complete. Every thread in the block must reach
   // this barrier, so it is kept outside the idx < size branch.
   __syncthreads();

   // Tree reduction in shared memory: on each pass, each active thread keeps
   // the larger of its element and the one a stride away, the block
   // synchronizes, and the stride is halved. blockDim.x must be a power of two.
   for (int stride = bdim/2; stride > 0; stride /= 2) {
      if (tid < stride && idx + stride < size) {
         if (sdata[tid + stride] > sdata[tid]) {
            sdata[tid] = sdata[tid + stride];
         }
      }
      __syncthreads();
   }

   // Thread 0 stores this block's partial maximum absolute value
   if (tid == 0 && idx < size) {
      max[bid] = sdata[0];
   }
}

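/*
* Illustrative usage sketch (an editorial example, not part of the PSCF
* sources): using reductionMaxAbs to compute the maximum absolute value (the
* infinity norm) of a device array of length n, such as a residual field.
* Unlike the inner product, the block partials here can be finished by
* reapplying a max reduction: the partials are already non-negative, so
* either reductionMax or reductionMaxAbs works for the second stage. This
* sketch assumes nBlocks <= 2*nThreads so one additional single-block launch
* suffices; larger arrays would need more passes. Names and the block size
* are illustrative choices; error checking is omitted.
*/
inline cudaReal gpuMaxAbsSketch(const cudaReal* d_in, int n)
{
   const int nThreads = 256;   // block size; must be a power of two
   int nBlocks = (n + 2*nThreads - 1) / (2*nThreads);

   // Stage 1: one partial maximum per block
   cudaReal* d_partial = nullptr;
   cudaMalloc((void**)&d_partial, sizeof(cudaReal) * nBlocks);
   reductionMaxAbs<<<nBlocks, nThreads, nThreads*sizeof(cudaReal)>>>(d_partial, d_in, n);

   // Stage 2: reduce the block partials with a single-block launch
   cudaReal* d_result = nullptr;
   cudaMalloc((void**)&d_result, sizeof(cudaReal));
   reductionMax<<<1, nThreads, nThreads*sizeof(cudaReal)>>>(d_result, d_partial, nBlocks);

   cudaReal result;
   cudaMemcpy(&result, d_result, sizeof(cudaReal), cudaMemcpyDeviceToHost);
   cudaFree(d_partial);
   cudaFree(d_result);
   return result;
}
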
__global__ void reductionMin(cudaReal* min, const cudaReal* in, int size)
{
   // Each block reduces 2*blockDim.x input elements, so the grid is half
   // the size of a one-element-per-thread launch and no threads are idle
   // during the first load.
   int tid = threadIdx.x;
   int bid = blockIdx.x;
   int bdim = blockDim.x;
   int idx = bid * (bdim*2) + tid;

   // Shared memory workspace for this block's partial reduction
   extern __shared__ cudaReal sdata[];

   // Load from global memory and perform the first pairwise comparison
   if (idx < size) {
      cudaReal in0 = in[idx];
      if (idx + bdim < size) {
         cudaReal in1 = in[idx + bdim];
         sdata[tid] = (in0 < in1) ? in0 : in1;
      } else {
         sdata[tid] = in0;
      }
   }

   // Wait for all loads to complete. Every thread in the block must reach
   // this barrier, so it is kept outside the idx < size branch.
   __syncthreads();

   // Tree reduction in shared memory: on each pass, each active thread keeps
   // the smaller of its element and the one a stride away, the block
   // synchronizes, and the stride is halved. blockDim.x must be a power of two.
   for (int stride = bdim/2; stride > 0; stride /= 2) {
      if (tid < stride && idx + stride < size) {
         if (sdata[tid + stride] < sdata[tid]) {
            sdata[tid] = sdata[tid + stride];
         }
      }
      __syncthreads();
   }

   // Thread 0 stores this block's partial minimum in global memory
   if (tid == 0 && idx < size) {
      min[bid] = sdata[0];
   }
}

__global__ void reductionMinAbs(cudaReal* min, const cudaReal* in, int size)
{
   // Each block reduces 2*blockDim.x input elements, so the grid is half
   // the size of a one-element-per-thread launch and no threads are idle
   // during the first load.
   int tid = threadIdx.x;
   int bid = blockIdx.x;
   int bdim = blockDim.x;
   int idx = bid * (bdim*2) + tid;

   // Shared memory workspace for this block's partial reduction
   extern __shared__ cudaReal sdata[];

   // Load from global memory, take absolute values, and perform the first
   // pairwise comparison
   if (idx < size) {
      cudaReal in0 = fabs(in[idx]);
      if (idx + bdim < size) {
         cudaReal in1 = fabs(in[idx + bdim]);
         sdata[tid] = (in0 < in1) ? in0 : in1;
      } else {
         sdata[tid] = in0;
      }
   }

   // Wait for all loads to complete. Every thread in the block must reach
   // this barrier, so it is kept outside the idx < size branch.
   __syncthreads();

   // Tree reduction in shared memory: on each pass, each active thread keeps
   // the smaller of its element and the one a stride away, the block
   // synchronizes, and the stride is halved. blockDim.x must be a power of two.
   for (int stride = bdim/2; stride > 0; stride /= 2) {
      if (tid < stride && idx + stride < size) {
         if (sdata[tid + stride] < sdata[tid]) {
            sdata[tid] = sdata[tid + stride];
         }
      }
      __syncthreads();
   }

   // Thread 0 stores this block's partial minimum absolute value
   if (tid == 0 && idx < size) {
      min[bid] = sdata[0];
   }
}

} // namespace Pspg
} // namespace Pscf
#endif