Reduce.cu
/*
* PSCF Package
*
* Copyright 2016 - 2022, The Regents of the University of Minnesota
* Distributed under the terms of the GNU General Public License.
*/

#include "Reduce.h"
#include <pscf/cuda/ThreadArray.h>
#include <pscf/cuda/HostDArray.h>
#include <pscf/cuda/cudaErrorCheck.h>
#include <cmath>

namespace Pscf {
namespace Prdc {
namespace Cuda {
namespace Reduce {

// CUDA kernels:
// (defined in anonymous namespace, used within this file only)

namespace {

   // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   // Parallel reduction: single-warp functions
   // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

   /*
   * The reduction algorithm can be simplified during the last 6
   * levels of reduction, because these levels of reduction are
   * performed by a single warp. Within a single warp, each thread
   * executes the same instruction at the same time (SIMD execution).
   * Therefore, we don't need the __syncthreads() command between
   * reduction operations. Further, we do not need to evaluate an
   * if-statement to determine which threads should perform the
   * calculation and which should not, since the entire warp will be
   * dedicated to these operations regardless of whether they perform
   * calculations. Therefore, the if-statement would not free any
   * resources for other tasks, so we omit it for speed.
   *
   * We assume here that a single warp contains 32 threads. All
   * CUDA-compatible GPUs currently meet this criterion, but it is
   * possible that someday there will be GPUs with a different warp
   * size. The methods below may break if the warp size is smaller
   * than 32 threads, because the operations would be performed by
   * multiple warps without __syncthreads() commands to keep them
   * synced. Warps larger than 32 threads would still be compatible
   * with these functions, though the functions are not optimized
   * for this case.
   *
   * These are implemented as separate functions, rather than within
   * the full reduction kernels defined below, because they require
   * the sData array to be declared volatile (meaning the array values
   * may change at any time, so the compiler must access the actual
   * memory location rather than using cached values).
   */
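
   /*
   * Note: the first step of each function below reads sData[tId + 32],
   * so sData must contain at least 64 valid entries; this is why the
   * kernels below require a block size of at least 64 threads. For
   * _warpSum, thread 0 executes s[0] += s[32], s[0] += s[16],
   * s[0] += s[8], s[0] += s[4], s[0] += s[2], s[0] += s[1]; with all
   * 32 threads performing the analogous steps in lockstep, s[0] ends
   * up holding the sum of s[0..63].
   *
   * A related caveat: on GPUs with independent thread scheduling
   * (Volta and later), threads within a warp are not guaranteed to
   * run in lockstep, so warp-synchronous code of this style may need
   * __syncwarp() between steps, or warp shuffle intrinsics, if that
   * lockstep assumption is ever violated.
   */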

   /*
   * Utility to perform parallel reduction summation within a single warp.
   *
   * \param sData input array to reduce
   * \param tId thread ID
   */
   __device__ void _warpSum(volatile cudaReal* sData, int tId)
   {
      sData[tId] += sData[tId + 32];
      sData[tId] += sData[tId + 16];
      sData[tId] += sData[tId + 8];
      sData[tId] += sData[tId + 4];
      sData[tId] += sData[tId + 2];
      sData[tId] += sData[tId + 1];
   }

   /*
   * Utility to perform parallel reduction maximization within a single warp.
   *
   * \param sData input array to reduce
   * \param tId thread ID
   */
   __device__ void _warpMax(volatile cudaReal* sData, int tId)
   {
      if (sData[tId + 32] > sData[tId]) sData[tId] = sData[tId + 32];
      if (sData[tId + 16] > sData[tId]) sData[tId] = sData[tId + 16];
      if (sData[tId + 8] > sData[tId]) sData[tId] = sData[tId + 8];
      if (sData[tId + 4] > sData[tId]) sData[tId] = sData[tId + 4];
      if (sData[tId + 2] > sData[tId]) sData[tId] = sData[tId + 2];
      if (sData[tId + 1] > sData[tId]) sData[tId] = sData[tId + 1];
   }

   /*
   * Utility to perform parallel reduction minimization within a single warp.
   *
   * \param sData input array to reduce
   * \param tId thread ID
   */
   __device__ void _warpMin(volatile cudaReal* sData, int tId)
   {
      if (sData[tId + 32] < sData[tId]) sData[tId] = sData[tId + 32];
      if (sData[tId + 16] < sData[tId]) sData[tId] = sData[tId + 16];
      if (sData[tId + 8] < sData[tId]) sData[tId] = sData[tId + 8];
      if (sData[tId + 4] < sData[tId]) sData[tId] = sData[tId + 4];
      if (sData[tId + 2] < sData[tId]) sData[tId] = sData[tId + 2];
      if (sData[tId + 1] < sData[tId]) sData[tId] = sData[tId + 1];
   }

   // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   // Parallel reduction: full kernels
   // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

   /*
   * Compute sum of array elements (GPU kernel).
   *
   * Assumes each warp is 32 threads.
   * Assumes that each block contains at least 64 threads.
   * Assumes that the block size is a power of 2.
   *
   * \param sum reduced array containing the sum from each thread block
   * \param in input array
   * \param n number of input array elements
   */
   __global__ void _sum(cudaReal* sum, const cudaReal* in, int n)
   {
      // The grid is half the size of a one-element-per-thread mapping:
      // each thread loads and combines two elements, so no threads are
      // idle during the initial load.
      int tId = threadIdx.x;
      int bId = blockIdx.x;
      int bDim = blockDim.x;
      int idx = bId * (bDim*2) + tId;

      // Shared memory holding area
      extern __shared__ cudaReal sData[];

      // Global memory load and first operation
      if (idx < n) {
         sData[tId] = in[idx];
         if (idx + bDim < n) {
            sData[tId] += in[idx+bDim];
         }
      } else {
         // idx >= n. Set value to 0.0 to fully populate sData without
         // contributing to the sum
         sData[tId] = 0.0;
      }

      // Wait for all threads to finish
      __syncthreads();

      // Make reductions across the block of data, each thread handling
      // one reduction across two data points with strided indices before
      // syncing with each other and then making further reductions.
      for (int stride = bDim/2; stride > 32; stride /= 2) {
         if (tId < stride) {
            sData[tId] += sData[tId+stride];
         }
         __syncthreads();
      }

      // Unwrap last warp (stride == 32)
      if (tId < 32) {
         _warpSum(sData, tId); // defined above in this file
      }

      // Store the output of the threads in this block
      if (tId == 0) {
         sum[bId] = sData[0];
      }
   }
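
   /*
   * Launch sizing sketch (this mirrors what the wrapper functions later
   * in this file do): each block reduces 2*blockDim.x input elements, so
   * one pass over n elements uses roughly
   *
   *    nThreads = threads per block (a power of 2, at least 64)
   *    nBlocks  = ceil(n / (2.0*nThreads))
   *    _sum<<<nBlocks, nThreads, nThreads*sizeof(cudaReal)>>>(out, in, n);
   *
   * which leaves one partial sum per block in out; out can then be fed
   * back through _sum or summed on the host.
   */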

   /*
   * Get maximum of array elements (GPU kernel).
   *
   * Assumes each warp is 32 threads.
   * Assumes that each block contains at least 64 threads.
   * Assumes that the block size is a power of 2.
   *
   * \param max reduced array containing the max from each thread block
   * \param in input array
   * \param n number of input array elements
   */
   __global__ void _max(cudaReal* max, const cudaReal* in, int n)
   {
      // The grid is half the size of a one-element-per-thread mapping:
      // each thread loads and combines two elements, so no threads are
      // idle during the initial load.
      int tId = threadIdx.x;
      int bId = blockIdx.x;
      int bDim = blockDim.x;
      int idx = bId * (bDim*2) + tId;

      // Shared memory holding area
      extern __shared__ cudaReal sData[];

      // Global memory load and first operation
      if (idx < n) {
         sData[tId] = in[idx];
         if (idx + bDim < n) {
            cudaReal in1 = in[idx+bDim];
            sData[tId] = (sData[tId] > in1) ? sData[tId] : in1;
         }
      } else {
         // idx >= n. Set value to in[idx-n], an earlier value in the
         // array, to fully populate sData without altering the result.
         // (A constant such as 0.0 could alter the result here, e.g.
         // if all elements are negative.)
         sData[tId] = in[idx-n];
      }

      // Wait for all threads to finish
      __syncthreads();

      // Make reductions across the block of data, each thread handling
      // one reduction across two data points with strided indices before
      // syncing with each other and then making further reductions.
      for (int stride = bDim/2; stride > 32; stride /= 2) {
         if (tId < stride) {
            if (sData[tId+stride] > sData[tId]) {
               sData[tId] = sData[tId+stride];
            }
         }
         __syncthreads();
      }

      // Unwrap last warp (stride == 32)
      if (tId < 32) {
         _warpMax(sData, tId); // defined above in this file
      }

      // Store the output of the threads in this block
      if (tId == 0) {
         max[bId] = sData[0];
      }
   }

   /*
   * Get maximum absolute magnitude of array elements (GPU kernel).
   *
   * Assumes each warp is 32 threads.
   * Assumes that each block contains at least 64 threads.
   * Assumes that the block size is a power of 2.
   *
   * \param max reduced array containing the max from each thread block
   * \param in input array
   * \param n number of input array elements
   */
   __global__ void _maxAbs(cudaReal* max, const cudaReal* in, int n)
   {
      // The grid is half the size of a one-element-per-thread mapping:
      // each thread loads and combines two elements, so no threads are
      // idle during the initial load.
      int tId = threadIdx.x;
      int bId = blockIdx.x;
      int bDim = blockDim.x;
      int idx = bId * (bDim*2) + tId;

      // Shared memory holding area
      extern __shared__ cudaReal sData[];

      // Global memory load and first operation
      if (idx < n) {
         sData[tId] = fabs(in[idx]);
         if (idx + bDim < n) {
            cudaReal in1 = fabs(in[idx+bDim]);
            sData[tId] = (sData[tId] > in1) ? sData[tId] : in1;
         }
      } else {
         // idx >= n. Set value to 0.0 to fully populate sData without
         // altering the result (safe here, because all stored values
         // are absolute magnitudes and therefore >= 0).
         sData[tId] = 0.0;
      }

      // Wait for all threads to finish
      __syncthreads();

      // Make reductions across the block of data, each thread handling
      // one reduction across two data points with strided indices before
      // syncing with each other and then making further reductions.
      for (int stride = bDim/2; stride > 32; stride /= 2) {
         if (tId < stride) {
            if (sData[tId+stride] > sData[tId]) {
               sData[tId] = sData[tId+stride];
            }
         }
         __syncthreads();
      }

      // Unwrap last warp (stride == 32)
      if (tId < 32) {
         _warpMax(sData, tId); // defined above in this file
      }

      // Store the output of the threads in this block
      if (tId == 0) {
         max[bId] = sData[0];
      }
   }

   /*
   * Get minimum of array elements (GPU kernel).
   *
   * Assumes each warp is 32 threads.
   * Assumes that each block contains at least 64 threads.
   * Assumes that the block size is a power of 2.
   *
   * \param min reduced array containing the min from each thread block
   * \param in input array
   * \param n number of input array elements
   */
   __global__ void _min(cudaReal* min, const cudaReal* in, int n)
   {
      // The grid is half the size of a one-element-per-thread mapping:
      // each thread loads and combines two elements, so no threads are
      // idle during the initial load.
      int tId = threadIdx.x;
      int bId = blockIdx.x;
      int bDim = blockDim.x;
      int idx = bId * (bDim*2) + tId;

      // Shared memory holding area
      extern __shared__ cudaReal sData[];

      // Global memory load and first operation
      if (idx < n) {
         sData[tId] = in[idx];
         if (idx + bDim < n) {
            cudaReal in1 = in[idx+bDim];
            sData[tId] = (sData[tId] < in1) ? sData[tId] : in1;
         }
      } else {
         // idx >= n. Set value to in[idx-n], an earlier value in the
         // array, to fully populate sData without altering the result.
         // (A constant such as 0.0 could alter the result here, e.g.
         // if all elements are positive.)
         sData[tId] = in[idx-n];
      }

      // Wait for all threads to finish
      __syncthreads();

      // Make reductions across the block of data, each thread handling
      // one reduction across two data points with strided indices before
      // syncing with each other and then making further reductions.
      for (int stride = bDim/2; stride > 32; stride /= 2) {
         if (tId < stride) {
            if (sData[tId+stride] < sData[tId]) {
               sData[tId] = sData[tId+stride];
            }
         }
         __syncthreads();
      }

      // Unwrap last warp (stride == 32)
      if (tId < 32) {
         _warpMin(sData, tId); // defined above in this file
      }

      // Store the output of the threads in this block
      if (tId == 0) {
         min[bId] = sData[0];
      }
   }

   /*
   * Get minimum absolute magnitude of array elements (GPU kernel).
   *
   * Assumes each warp is 32 threads.
   * Assumes that each block contains at least 64 threads.
   * Assumes that the block size is a power of 2.
   *
   * \param min reduced array containing the min from each thread block
   * \param in input array
   * \param n number of input array elements
   */
   __global__ void _minAbs(cudaReal* min, const cudaReal* in, int n)
   {
      // The grid is half the size of a one-element-per-thread mapping:
      // each thread loads and combines two elements, so no threads are
      // idle during the initial load.
      int tId = threadIdx.x;
      int bId = blockIdx.x;
      int bDim = blockDim.x;
      int idx = bId * (bDim*2) + tId;

      // Shared memory holding area
      extern __shared__ cudaReal sData[];

      // Global memory load and first operation
      if (idx < n) {
         sData[tId] = fabs(in[idx]);
         if (idx + bDim < n) {
            cudaReal in1 = fabs(in[idx+bDim]);
            sData[tId] = (sData[tId] < in1) ? sData[tId] : in1;
         }
      } else {
         // idx >= n. Set value to fabs(in[idx-n]), an earlier value in
         // the array, to fully populate sData without altering the
         // result. (A constant such as 0.0 could alter the result here,
         // since it would be smaller than any nonzero magnitude.)
         sData[tId] = fabs(in[idx-n]);
      }

      // Wait for all threads to finish
      __syncthreads();

      // Make reductions across the block of data, each thread handling
      // one reduction across two data points with strided indices before
      // syncing with each other and then making further reductions.
      for (int stride = bDim/2; stride > 32; stride /= 2) {
         if (tId < stride) {
            if (sData[tId+stride] < sData[tId]) {
               sData[tId] = sData[tId+stride];
            }
         }
         __syncthreads();
      }

      // Unwrap last warp (stride == 32)
      if (tId < 32) {
         _warpMin(sData, tId); // defined above in this file
      }

      // Store the output of the threads in this block
      if (tId == 0) {
         min[bId] = sData[0];
      }
   }

   /*
   * Compute inner product of two real arrays (GPU kernel).
   *
   * Assumes each warp is 32 threads.
   * Assumes that each block contains at least 64 threads.
   * Assumes that the block size is a power of 2.
   *
   * \param ip reduced array containing the inner product from each thread block
   * \param a first input array
   * \param b second input array
   * \param n number of input array elements
   */
   __global__ void _innerProduct(cudaReal* ip, const cudaReal* a,
                                 const cudaReal* b, int n)
   {
      // The grid is half the size of a one-element-per-thread mapping:
      // each thread loads and combines two elements, so no threads are
      // idle during the initial load.
      int tId = threadIdx.x;
      int bId = blockIdx.x;
      int bDim = blockDim.x;
      int idx = bId * (bDim*2) + tId;

      // Shared memory holding area
      extern __shared__ cudaReal sData[];

      // Global memory load and first operation
      if (idx < n) {
         sData[tId] = a[idx] * b[idx];
         if (idx + bDim < n) {
            sData[tId] += (a[idx+bDim] * b[idx+bDim]);
         }
      } else {
         // idx >= n. Set value to 0.0 to fully populate sData without
         // contributing to the sum
         sData[tId] = 0.0;
      }

      // Wait for all threads to finish
      __syncthreads();

      // Make reductions across the block of data, each thread handling
      // one reduction across two data points with strided indices before
      // syncing with each other and then making further reductions.
      for (int stride = bDim/2; stride > 32; stride /= 2) {
         if (tId < stride) {
            sData[tId] += sData[tId+stride];
         }
         __syncthreads();
      }

      // Unwrap last warp (stride == 32)
      if (tId < 32) {
         _warpSum(sData, tId); // defined above in this file
      }

      // Store the output of the threads in this block
      if (tId == 0) {
         ip[bId] = sData[0];
      }
   }

}


// CUDA kernel wrappers:
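//
// Each wrapper below reduces the device array in passes: a kernel launch
// produces one partial result per thread block, and that much smaller
// array is reduced again, alternating between the temporary device arrays
// temp1 and temp2 so that the previous pass's buffer can be freed, until
// fewer than 1e5 elements remain. The remaining elements are then copied
// to the host and the final reduction is finished on the CPU.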

/*
* Compute sum of array elements (GPU kernel wrapper).
*/
cudaReal sum(DeviceArray<cudaReal> const & in)
{
   UTIL_CHECK(in.isAllocated());
   int n = in.capacity();

   // Set up temporary device arrays for storing reduced data
   DeviceArray<cudaReal> temp1, temp2;

   int i = 0;

   // Perform parallel reduction on GPU repeatedly until n < 1e5
   while (n >= 1e5) {
      // Establish GPU resources for this parallel reduction. Divided by
      // two because of the global memory load in the kernel performing
      // the first level of reduction!
      int nBlocks, nThreads;
      int halvedSize = ceil((float)n/2);

      ThreadArray::setThreadsLogical(halvedSize,nBlocks,nThreads);
      // Note: setThreadsLogical ensures that nThreads is a power of 2

      if (nThreads < 64) {
         // Thread blocks too small. Manually set nThreads to 64
         ThreadArray::setThreadsPerBlock(64);
         ThreadArray::setThreadsLogical(halvedSize,nBlocks,nThreads);

         // If the above was successful, print warning
         Log::file() << "Warning: "
                     << "nThreads too small for parallel reduction.\n"
                     << "Setting nThreads equal to 64." << std::endl;
      }

      // Warp size must be 32
      UTIL_CHECK(ThreadArray::warpSize() == 32);

      // Perform parallel reduction
      if (i == 0) { // first reduction, use input array
         temp1.allocate(nBlocks);
         _sum<<<nBlocks, nThreads, nThreads*sizeof(cudaReal)>>>
                                       (temp1.cArray(), in.cArray(), n);
         cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
      } else if (i % 2 == 1) { // i is odd: reduce temp1, store in temp2
         temp2.allocate(nBlocks);
         _sum<<<nBlocks, nThreads, nThreads*sizeof(cudaReal)>>>
                                       (temp2.cArray(), temp1.cArray(), n);
         cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
         temp1.deallocate();
      } else { // i is even: reduce temp2, store in temp1
         temp1.allocate(nBlocks);
         _sum<<<nBlocks, nThreads, nThreads*sizeof(cudaReal)>>>
                                       (temp1.cArray(), temp2.cArray(), n);
         cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
         temp2.deallocate();
      }

      n = nBlocks;
      i += 1;
   }

   // Transfer the partially reduced sum to the host
   HostDArray<cudaReal> temp_h(n);
   if (i == 0) {
      temp_h = in;
   } else if (i % 2 == 1) {
      temp_h = temp1;
   } else {
      temp_h = temp2;
   }

   if (n == 1) {
      return temp_h[0];
   } else {
      // Sum up elements of temp_h to get final result.
      // Use Kahan summation to reduce accumulation of error
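      // (err holds the low-order bits lost in the previous addition;
      // subtracting it from the next element before adding restores them.)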
      cudaReal sum = 0.0, tempVal, tempSum;
      cudaReal err = 0.0;
      for (int i = 0; i < n; ++i) {
         tempVal = temp_h[i] - err;
         tempSum = sum + tempVal;
         err = tempSum - sum - tempVal;
         sum = tempSum;
      }
      return sum;
   }
}

/*
* Get maximum of array elements (GPU kernel wrapper).
*/
cudaReal max(DeviceArray<cudaReal> const & in)
{
   UTIL_CHECK(in.isAllocated());
   int n = in.capacity();

   // Set up temporary device arrays for storing reduced data
   DeviceArray<cudaReal> temp1, temp2;

   int i = 0;

   // Perform parallel reduction on GPU repeatedly until n < 1e5
   while (n >= 1e5) {
      // Establish GPU resources for this parallel reduction. Divided by
      // two because of the global memory load in the kernel performing
      // the first level of reduction!
      int nBlocks, nThreads;
      int halvedSize = ceil((float)n/2);

      ThreadArray::setThreadsLogical(halvedSize,nBlocks,nThreads);
      // Note: setThreadsLogical ensures that nThreads is a power of 2

      if (nThreads < 64) {
         // Thread blocks too small. Manually set nThreads to 64
         ThreadArray::setThreadsPerBlock(64);
         ThreadArray::setThreadsLogical(halvedSize,nBlocks,nThreads);

         // If the above was successful, print warning
         Log::file() << "Warning: "
                     << "nThreads too small for parallel reduction.\n"
                     << "Setting nThreads equal to 64." << std::endl;
      }

      // Warp size must be 32
      UTIL_CHECK(ThreadArray::warpSize() == 32);

      // Perform parallel reduction
      if (i == 0) { // first reduction, use input array
         temp1.allocate(nBlocks);
         _max<<<nBlocks, nThreads, nThreads*sizeof(cudaReal)>>>
                                       (temp1.cArray(), in.cArray(), n);
         cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
      } else if (i % 2 == 1) { // i is odd: reduce temp1, store in temp2
         temp2.allocate(nBlocks);
         _max<<<nBlocks, nThreads, nThreads*sizeof(cudaReal)>>>
                                       (temp2.cArray(), temp1.cArray(), n);
         cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
         temp1.deallocate();
      } else { // i is even: reduce temp2, store in temp1
         temp1.allocate(nBlocks);
         _max<<<nBlocks, nThreads, nThreads*sizeof(cudaReal)>>>
                                       (temp1.cArray(), temp2.cArray(), n);
         cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
         temp2.deallocate();
      }

      n = nBlocks;
      i += 1;
   }

   // Transfer the partially reduced result to the host
   HostDArray<cudaReal> temp_h(n);
   if (i == 0) {
      temp_h = in;
   } else if (i % 2 == 1) {
      temp_h = temp1;
   } else {
      temp_h = temp2;
   }

   cudaReal max = temp_h[0];
   for (int i = 1; i < n; i++) {
      if (temp_h[i] > max) max = temp_h[i];
   }
   return max;
}

/*
* Get maximum absolute magnitude of array elements (GPU kernel wrapper).
*/
cudaReal maxAbs(DeviceArray<cudaReal> const & in)
{
   UTIL_CHECK(in.isAllocated());
   int n = in.capacity();

   // Set up temporary device arrays for storing reduced data
   DeviceArray<cudaReal> temp1, temp2;

   int i = 0;

   // Perform parallel reduction on GPU repeatedly until n < 1e5
   // (note: for this wrapper, we always call the kernel at least once,
   // even if n < 1e5, so that the part done on the CPU is always just
   // comparing array element size, without needing fabs().)
   while (n >= 1e5 || i == 0) {
      // Establish GPU resources for this parallel reduction. Divided by
      // two because of the global memory load in the kernel performing
      // the first level of reduction!
      int nBlocks, nThreads;
      int halvedSize = ceil((float)n/2);

      ThreadArray::setThreadsLogical(halvedSize,nBlocks,nThreads);
      // Note: setThreadsLogical ensures that nThreads is a power of 2

      if (nThreads < 64) {
         // Thread blocks too small. Manually set nThreads to 64
         ThreadArray::setThreadsPerBlock(64);
         ThreadArray::setThreadsLogical(halvedSize,nBlocks,nThreads);

         // If the above was successful, print warning
         Log::file() << "Warning: "
                     << "nThreads too small for parallel reduction.\n"
                     << "Setting nThreads equal to 64." << std::endl;
      }

      // Warp size must be 32
      UTIL_CHECK(ThreadArray::warpSize() == 32);

      // Perform parallel reduction
      if (i == 0) { // first reduction, use input array
         temp1.allocate(nBlocks);
         _maxAbs<<<nBlocks, nThreads, nThreads*sizeof(cudaReal)>>>
                                       (temp1.cArray(), in.cArray(), n);
         cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
      } else if (i % 2 == 1) { // i is odd: reduce temp1, store in temp2
         temp2.allocate(nBlocks);
         _max<<<nBlocks, nThreads, nThreads*sizeof(cudaReal)>>>
                                       (temp2.cArray(), temp1.cArray(), n);
         cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
         temp1.deallocate();
      } else { // i is even: reduce temp2, store in temp1
         temp1.allocate(nBlocks);
         _max<<<nBlocks, nThreads, nThreads*sizeof(cudaReal)>>>
                                       (temp1.cArray(), temp2.cArray(), n);
         cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
         temp2.deallocate();
      }

      n = nBlocks;
      i += 1;
   }

   // Transfer the partially reduced result to the host
   HostDArray<cudaReal> temp_h(n);
   if (i == 0) {
      temp_h = in;
   } else if (i % 2 == 1) {
      temp_h = temp1;
   } else {
      temp_h = temp2;
   }

   cudaReal max = temp_h[0];
   for (int i = 1; i < n; i++) {
      if (temp_h[i] > max) max = temp_h[i];
   }
   return max;
}

/*
* Get minimum of array elements (GPU kernel wrapper).
*/
cudaReal min(DeviceArray<cudaReal> const & in)
{
   UTIL_CHECK(in.isAllocated());
   int n = in.capacity();

   // Set up temporary device arrays for storing reduced data
   DeviceArray<cudaReal> temp1, temp2;

   int i = 0;

   // Perform parallel reduction on GPU repeatedly until n < 1e5
   while (n >= 1e5) {
      // Establish GPU resources for this parallel reduction. Divided by
      // two because of the global memory load in the kernel performing
      // the first level of reduction!
      int nBlocks, nThreads;
      int halvedSize = ceil((float)n/2);

      ThreadArray::setThreadsLogical(halvedSize,nBlocks,nThreads);
      // Note: setThreadsLogical ensures that nThreads is a power of 2

      if (nThreads < 64) {
         // Thread blocks too small. Manually set nThreads to 64
         ThreadArray::setThreadsPerBlock(64);
         ThreadArray::setThreadsLogical(halvedSize,nBlocks,nThreads);

         // If the above was successful, print warning
         Log::file() << "Warning: "
                     << "nThreads too small for parallel reduction.\n"
                     << "Setting nThreads equal to 64." << std::endl;
      }

      // Warp size must be 32
      UTIL_CHECK(ThreadArray::warpSize() == 32);

      // Perform parallel reduction
      if (i == 0) { // first reduction, use input array
         temp1.allocate(nBlocks);
         _min<<<nBlocks, nThreads, nThreads*sizeof(cudaReal)>>>
                                       (temp1.cArray(), in.cArray(), n);
         cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
      } else if (i % 2 == 1) { // i is odd: reduce temp1, store in temp2
         temp2.allocate(nBlocks);
         _min<<<nBlocks, nThreads, nThreads*sizeof(cudaReal)>>>
                                       (temp2.cArray(), temp1.cArray(), n);
         cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
         temp1.deallocate();
      } else { // i is even: reduce temp2, store in temp1
         temp1.allocate(nBlocks);
         _min<<<nBlocks, nThreads, nThreads*sizeof(cudaReal)>>>
                                       (temp1.cArray(), temp2.cArray(), n);
         cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
         temp2.deallocate();
      }

      n = nBlocks;
      i += 1;
   }

   // Transfer the partially reduced result to the host
   HostDArray<cudaReal> temp_h(n);
   if (i == 0) {
      temp_h = in;
   } else if (i % 2 == 1) {
      temp_h = temp1;
   } else {
      temp_h = temp2;
   }

   cudaReal min = temp_h[0];
   for (int i = 1; i < n; i++) {
      if (temp_h[i] < min) min = temp_h[i];
   }
   return min;
}

/*
* Get minimum absolute magnitude of array elements (GPU kernel wrapper).
*/
cudaReal minAbs(DeviceArray<cudaReal> const & in)
{
   UTIL_CHECK(in.isAllocated());
   int n = in.capacity();

   // Set up temporary device arrays for storing reduced data
   DeviceArray<cudaReal> temp1, temp2;

   int i = 0;

   // Perform parallel reduction on GPU repeatedly until n < 1e5
   // (note: for this wrapper, we always call the kernel at least once,
   // even if n < 1e5, so that the part done on the CPU is always just
   // comparing array element size, without needing fabs().)
   while (n >= 1e5 || i == 0) {
      // Establish GPU resources for this parallel reduction. Divided by
      // two because of the global memory load in the kernel performing
      // the first level of reduction!
      int nBlocks, nThreads;
      int halvedSize = ceil((float)n/2);

      ThreadArray::setThreadsLogical(halvedSize,nBlocks,nThreads);
      // Note: setThreadsLogical ensures that nThreads is a power of 2

      if (nThreads < 64) {
         // Thread blocks too small. Manually set nThreads to 64
         ThreadArray::setThreadsPerBlock(64);
         ThreadArray::setThreadsLogical(halvedSize,nBlocks,nThreads);

         // If the above was successful, print warning
         Log::file() << "Warning: "
                     << "nThreads too small for parallel reduction.\n"
                     << "Setting nThreads equal to 64." << std::endl;
      }

      // Warp size must be 32
      UTIL_CHECK(ThreadArray::warpSize() == 32);

      // Perform parallel reduction
      if (i == 0) { // first reduction, use input array
         temp1.allocate(nBlocks);
         _minAbs<<<nBlocks, nThreads, nThreads*sizeof(cudaReal)>>>
                                       (temp1.cArray(), in.cArray(), n);
         cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
      } else if (i % 2 == 1) { // i is odd: reduce temp1, store in temp2
         temp2.allocate(nBlocks);
         _min<<<nBlocks, nThreads, nThreads*sizeof(cudaReal)>>>
                                       (temp2.cArray(), temp1.cArray(), n);
         cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
         temp1.deallocate();
      } else { // i is even: reduce temp2, store in temp1
         temp1.allocate(nBlocks);
         _min<<<nBlocks, nThreads, nThreads*sizeof(cudaReal)>>>
                                       (temp1.cArray(), temp2.cArray(), n);
         cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
         temp2.deallocate();
      }

      n = nBlocks;
      i += 1;
   }

   // Transfer the partially reduced result to the host
   HostDArray<cudaReal> temp_h(n);
   if (i == 0) {
      temp_h = in;
   } else if (i % 2 == 1) {
      temp_h = temp1;
   } else {
      temp_h = temp2;
   }

   cudaReal min = temp_h[0];
   for (int i = 1; i < n; i++) {
      if (temp_h[i] < min) min = temp_h[i];
   }
   return min;
}

/*
* Compute inner product of two real arrays (GPU kernel wrapper).
*/
cudaReal innerProduct(DeviceArray<cudaReal> const & a,
                      DeviceArray<cudaReal> const & b)
{
   UTIL_CHECK(a.isAllocated());
   UTIL_CHECK(b.isAllocated());
   UTIL_CHECK(a.capacity() == b.capacity());
   int n = a.capacity();

   // Set up temporary device arrays for storing reduced data
   DeviceArray<cudaReal> temp1, temp2;

   int i = 0;

   // Perform parallel reduction on GPU repeatedly until n < 1e5
   // (note: for this wrapper, we always call the kernel at least once,
   // even if n < 1e5, so that the part done on the CPU is always just
   // adding up array elements.)
   while (n >= 1e5 || i == 0) {
      // Establish GPU resources for this parallel reduction. Divided by
      // two because of the global memory load in the kernel performing
      // the first level of reduction!
      int nBlocks, nThreads;
      int halvedSize = ceil((float)n/2);

      ThreadArray::setThreadsLogical(halvedSize,nBlocks,nThreads);
      // Note: setThreadsLogical ensures that nThreads is a power of 2

      if (nThreads < 64) {
         // Thread blocks too small. Manually set nThreads to 64
         ThreadArray::setThreadsPerBlock(64);
         ThreadArray::setThreadsLogical(halvedSize,nBlocks,nThreads);

         // If the above was successful, print warning
         Log::file() << "Warning: "
                     << "nThreads too small for parallel reduction.\n"
                     << "Setting nThreads equal to 64." << std::endl;
      }

      // Warp size must be 32
      UTIL_CHECK(ThreadArray::warpSize() == 32);

      // Perform parallel reduction

      // (note: only the first kernel call uses _innerProduct. After
      // that, we use _sum to reduce the array output by _innerProduct.)

      if (i == 0) { // first reduction, use input arrays
         temp1.allocate(nBlocks);
         _innerProduct<<<nBlocks, nThreads, nThreads*sizeof(cudaReal)>>>
                              (temp1.cArray(), a.cArray(), b.cArray(), n);
         cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
      } else if (i % 2 == 1) { // i is odd: reduce temp1, store in temp2
         temp2.allocate(nBlocks);
         _sum<<<nBlocks, nThreads, nThreads*sizeof(cudaReal)>>>
                                       (temp2.cArray(), temp1.cArray(), n);
         cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
         temp1.deallocate();
      } else { // i is even: reduce temp2, store in temp1
         temp1.allocate(nBlocks);
         _sum<<<nBlocks, nThreads, nThreads*sizeof(cudaReal)>>>
                                       (temp1.cArray(), temp2.cArray(), n);
         cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
         temp2.deallocate();
      }

      n = nBlocks;
      i += 1;
   }

   // Transfer the partially reduced sum to the host
   HostDArray<cudaReal> temp_h(n);
   if (i % 2 == 1) {
      temp_h = temp1;
   } else {
      temp_h = temp2;
   }

   if (n == 1) {
      return temp_h[0];
   } else {
      // Sum up elements of temp_h to get final result.
      // Use Kahan summation to reduce accumulation of error
      cudaReal sum = 0.0, tempVal, tempSum;
      cudaReal err = 0.0;
      for (int i = 0; i < n; ++i) {
         tempVal = temp_h[i] - err;
         tempSum = sum + tempVal;
         err = tempSum - sum - tempVal;
         sum = tempSum;
      }
      return sum;
   }
}

}
}
}
}