PSCF v1.4.0
Reduce.cu
1/*
2* PSCF - Polymer Self-Consistent Field
3*
4* Copyright 2015 - 2025, The Regents of the University of Minnesota
5* Distributed under the terms of the GNU General Public License.
6*/
7
8#include "Reduce.h"
9#include "VecOp.h"
10#include <pscf/cuda/ThreadArray.h>
11#include <pscf/cuda/HostDArray.h>
12#include <pscf/cuda/DeviceMemory.h>
13#include <pscf/cuda/cudaErrorCheck.h>
14#include <pscf/cuda/complex.h>
15#include <util/misc/Log.h>
16
17//#include <thrust/reduce.h>
18//#include <thrust/device_vector.h>
19//#include <cub/device/device_reduce.cuh>
20#include <cub/cub.cuh>
21
22#include <complex>
23#include <cmath>
24
25namespace Pscf {
26namespace Reduce {
27
28 /*
29 * Reduction functions defined here use device-wide algorithms defined
30 * in the CUB library. The CUB library is part of the CUDA Core Compute
31 * Library collection that is distributed with the CUDA development kit.
32 *
33 * The CUB reduction functions require the user to provide workspace.
34 * To minimize time spent repeatedly allocating this workspace, the
35 * reduceSpace static variable is used as a memory block that grows as
36 * needed, but that can be re-used for operations that require a work
37 * space of size less than or equal to that of the largest previous
38 * required work space.
39 *
40 * The static variable transformSpace is used as a memory block to
41 * store the results of vector operations (such as taking the absolute
42 * magnitude or square of all elements) that are sometimes applied
43 * before applying a reduction operation.
44 *
45 * Both memory pools may be de-allocated by Reduce::freeWorkSpace(),
46 * but otherwise persist in memory for use as needed.
47 */
48
49 // Memory used as workspace by CUB reduction functions
50 static DeviceMemory reduceSpace_{};
51
52 // Memory used as workspace for vector operations (e.g, max or square)
53 static DeviceMemory transformSpace_{};
54
55 // Anonymous namespace, for entities that are only used in this file
56 namespace {
57
58 // Complex addition functor.
59 struct addComplexFunctor {
60
61 __host__ __device__ inline
62 cudaComplex operator() (cudaComplex const & a,
63 cudaComplex const & b) const
64 {
65 cudaComplex result;
66 result.x = a.x + b.x;
67 result.y = a.y + b.y;
68 return result;
69 }
70
71 };
72
73 /*
74 * Compute sum of elements of a real array.
75 */
76 cudaReal sum(cudaReal* inPtr, const int n)
77 {
78 UTIL_CHECK(n > 0);
79
80 // Create output pointer
81 DeviceArray<cudaReal> out(1);
82 cudaReal* outPtr = out.cArray();
83
84 // Determine size of required workspace, allocate if needed
85 size_t workSize = 0;
86 cudaError_t error;
87 error = cub::DeviceReduce::Sum(nullptr, workSize,
88 inPtr, outPtr, n);
89 UTIL_CHECK(error == cudaSuccess);
90 reduceSpace_.resize(workSize);
91 UTIL_CHECK(reduceSpace_.capacity() >= workSize);
92
93 // Perform reduction
94 error = cub::DeviceReduce::Sum(reduceSpace_.cArray(), workSize,
95 inPtr, outPtr, n);
96 UTIL_CHECK(error == cudaSuccess);
97
98 // Copy to host and return value
99 HostDArray<cudaReal> out_h;
100 out_h.allocate(1);
101 out_h = out;
102 return out_h[0];
103 }
104
105 /*
106 * Compute sum of elements of a complex array.
107 *
108 * Implementation uses Nvidia CUB Library functions.
109 */
110 std::complex<cudaReal> sum(cudaComplex* inPtr, const int n)
111 {
112 UTIL_CHECK(n > 0);
113
114 // Create output pointer
115 DeviceArray<cudaComplex> out(1);
116 cudaComplex* outPtr = out.cArray();
117
118 // Determine size of required workspace and allocate if necessary
119 size_t workSize = 0;
120 cudaError_t error;
121 auto op = addComplexFunctor{};
122 cudaComplex init = makeComplex(0.0, 0.0);
123 error = cub::DeviceReduce::Reduce(nullptr, workSize,
124 inPtr, outPtr, n, op, init);
125 UTIL_CHECK(error == cudaSuccess);
126 reduceSpace_.resize(workSize);
127
128 // Perform reduction
129 error = cub::DeviceReduce::Reduce(reduceSpace_.cArray(), workSize,
130 inPtr, outPtr, n, op, init);
131 UTIL_CHECK(error == cudaSuccess);
132
133 // Copy to host and return value
134 HostDArray<cudaComplex> out_h(1);
135 out_h = out;
136 return std::complex<cudaReal>(out_h[0].x, out_h[0].y);
137 }
138
139 /*
140 * Compute max of elements of a real array.
141 */
142 cudaReal max(cudaReal* inPtr, const int n)
143 {
144 UTIL_CHECK(n > 0);
145
146 // Create input and output pointers
147 DeviceArray<cudaReal> out(1);
148 cudaReal* outPtr = out.cArray();
149
150 // Determine size of required workspace, allocated if needed
151 size_t workSize = 0;
152 cudaError_t error;
153 error = cub::DeviceReduce::Max(nullptr, workSize,
154 inPtr, outPtr, n);
155 UTIL_CHECK(error == cudaSuccess);
156 reduceSpace_.resize(workSize);
157
158 // Perform reduction
159 error = cub::DeviceReduce::Max(reduceSpace_.cArray(), workSize,
160 inPtr, outPtr, n);
161 UTIL_CHECK(error == cudaSuccess);
162
163 // Copy to host and return value
164 HostDArray<cudaReal> out_h(1);
165 out_h = out;
166 return out_h[0];
167 }
168
169 /*
170 * Compute min of elements of a real array.
171 */
172 cudaReal min(cudaReal* inPtr, const int n)
173 {
174 UTIL_CHECK(n > 0);
175
176 // Create output pointer
177 DeviceArray<cudaReal> out(1);
178 cudaReal* outPtr = out.cArray();
179
180 // Determine size of required workspace, allocated if needed
181 size_t workSize = 0;
182 cudaError_t error;
183 error = cub::DeviceReduce::Min(nullptr, workSize,
184 inPtr, outPtr, n);
185 UTIL_CHECK(error == cudaSuccess);
186 reduceSpace_.resize(workSize);
187
188 // Perform reduction
189 error = cub::DeviceReduce::Min(reduceSpace_.cArray(), workSize,
190 inPtr, outPtr, n);
191 UTIL_CHECK(error == cudaSuccess);
192
193 // Copy to host and return value
194 HostDArray<cudaReal> out_h(1);
195 out_h = out;
196 return out_h[0];
197 }
198
199 } // anonmyous namespace Pscf::Reduce::(unnamed)
200
201 // Memory management
202
204 {
205 reduceSpace_.deallocate();
206 transformSpace_.deallocate();
207 }
208
209 // Reduction function definitions
210
211 /*
212 * Compute sum of elements of a real array.
213 *
214 * This implementation uses Nvidia CUB Library functions.
215 */
217 {
219 const int n = a.capacity();
220 UTIL_CHECK(n > 0);
221
222 cudaReal* inPtr = const_cast<cudaReal*>( a.cArray() );
223 return sum(inPtr, n);
224 }
225
226 /*
227 * Compute sum of elements of a real array slice.
228 *
229 * This implementation uses Nvidia CUB Library functions.
230 */
231 cudaReal sum(DeviceArray<cudaReal> const & a, int begin, int end)
232 {
234 UTIL_CHECK(begin >= 0);
235 UTIL_CHECK(end <= a.capacity());
236 const int n = end - begin;
237 UTIL_CHECK(n > 0);
238
239 cudaReal* inPtr = const_cast<cudaReal*>( a.cArray() );
240 inPtr += begin;
241 return sum(inPtr, n);
242 }
243
244 /*
245 * Compute sum of elements of a complex array.
246 *
247 * Implementation uses Nvidia CUB Library functions.
248 */
249 std::complex<cudaReal> sum(DeviceArray<cudaComplex> const & a)
250 {
252 const int n = a.capacity();
253 UTIL_CHECK(n > 0);
254
255 cudaComplex* inPtr = const_cast<cudaComplex*>( a.cArray() );
256 return sum(inPtr, n);
257 }
258
259 /*
260 * Compute sum of elements of a complex array slice.
261 *
262 * This implementation uses Nvidia CUB Library functions.
263 */
264 std::complex<cudaReal> sum(DeviceArray<cudaComplex> const & a,
265 int begin, int end)
266 {
268 UTIL_CHECK(begin >= 0);
269 UTIL_CHECK(end <= a.capacity());
270 const int n = end - begin;
271 UTIL_CHECK(n > 0);
272
273 cudaComplex* inPtr = const_cast<cudaComplex*>( a.cArray() );
274 inPtr += begin;
275 return sum(inPtr, n);
276 }
277
278 /*
279 * Return the sum of squares of elements of a real array.
280 */
282 {
284 int n = in.capacity();
285
286 // Set up temporary array for result of vector operation
287 int workSize = n * sizeof(cudaReal);
288 transformSpace_.resize(workSize);
290 temp.associate(transformSpace_, n);
291
292 // Compute an array of element-wise squares
293 VecOp::sqV(temp, in);
294
295 cudaReal result = Reduce::sum(temp);
296 temp.dissociate();
297
298 return result;
299 }
300
301 /*
302 * Return the sum of squares of elements of a complex array.
303 */
304 std::complex<cudaReal> sumSq(DeviceArray<cudaComplex> const & in)
305 {
307 int n = in.capacity();
308
309 // Set up temporary array for result of vector operation
310 int workSize = n * sizeof(cudaComplex);
311 transformSpace_.resize(workSize);
313 temp.associate(transformSpace_, n);
314
315 // Compute an array of element-wise squares
316 VecOp::sqV(temp, in);
317
318 // Compute sum of array temp of element-wise squares
319 std::complex<cudaReal> result = Reduce::sum(temp);
320 temp.dissociate();
321
322 return result;
323 }
324
325 /*
326 * Return the sum of square absolute magnitudes for a complex array.
327 */
329 {
331 int n = in.capacity();
332
333 // Set up temporary array for result of vector operation
334 int workSize = n * sizeof(cudaReal);
335 transformSpace_.resize(workSize);
337 temp.associate(transformSpace_, n);
338
339 // Compute an array of element-wise square absolute magnitudes
340 VecOp::sqAbsV(temp, in);
341
342 // Compute sum of array temp of element-wise square magnitudes
343 cudaReal result = Reduce::sum(temp);
344 temp.dissociate();
345
346 return result;
347 }
348
349 /*
350 * Compute inner product of two real arrays.
351 */
353 DeviceArray<cudaReal> const & b)
354 {
357 UTIL_CHECK(a.capacity() == b.capacity());
358 int n = a.capacity();
359
360 // Set up temporary array for result of vector operation
361 int workSize = n * sizeof(cudaReal);
362 transformSpace_.resize(workSize);
364 temp.associate(transformSpace_, n);
365
366 // Perform element-wise multiplication v[i] = a[i] * b[i]
367 VecOp::mulVV(temp, a, b);
368
369 // Compute sum of element-wise products
370 cudaReal result = Reduce::sum(temp);
371 temp.dissociate();
372
373 return result;
374 }
375
376 // Maxima
377
378 /*
379 * Compute max of elements of a real array.
380 */
382 {
384 const int n = a.capacity();
385 UTIL_CHECK(n > 0);
386
387 cudaReal* inPtr = const_cast<cudaReal*>( a.cArray() );
388 return max(inPtr, n);
389 }
390
391 /*
392 * Compute max of elements of a real array slice.
393 */
394 cudaReal max(DeviceArray<cudaReal> const & a, int begin, int end)
395 {
397 UTIL_CHECK(begin >= 0);
398 UTIL_CHECK(end <= a.capacity());
399 const int n = end - begin;
400 UTIL_CHECK(n > 0);
401
402 cudaReal* inPtr = const_cast<cudaReal*>( a.cArray() );
403 inPtr += begin;
404 return max(inPtr, n);
405 }
406
407 /*
408 * Return the maximum absolute magnitude of array elements.
409 */
411 {
413 int n = in.capacity();
414
415 // Set up temporary array for result of vector operation
416 int workSize = n * sizeof(cudaReal);
417 transformSpace_.resize(workSize);
419 temp.associate(transformSpace_, n);
420
421 // Compute an array of absolute magnitudes
422 VecOp::absV(temp, in);
423
424 // Compute maximum of array temp of absolute magnitudes
425 cudaReal result = Reduce::max(temp);
426 temp.dissociate();
427
428 return result;
429 }
430
431 // Minima
432
433 /*
434 * Compute minimum of all elements of a real array.
435 */
437 {
439 const int n = a.capacity();
440 UTIL_CHECK(n > 0);
441
442 cudaReal* inPtr = const_cast<cudaReal*>( a.cArray() );
443 return min(inPtr, n);
444 }
445
446 /*
447 * Compute minimum of elements of a real array slice.
448 */
449 cudaReal min(DeviceArray<cudaReal> const & a, int begin, int end)
450 {
452 UTIL_CHECK(begin >= 0);
453 UTIL_CHECK(end <= a.capacity());
454 const int n = end - begin;
455 UTIL_CHECK(n > 0);
456
457 cudaReal* inPtr = const_cast<cudaReal*>( a.cArray() );
458 inPtr += begin;
459 return min(inPtr, n);
460 }
461
462 /*
463 * Return the minimum absolute magnitude of array elements.
464 */
466 {
468 int n = in.capacity();
469
470 // Set up temporary array for result of vector operation
471 int workSize = n * sizeof(cudaReal);
472 transformSpace_.resize(workSize);
474 temp.associate(transformSpace_, n);
475
476 // Compute an array of absolute magnitudes
477 VecOp::absV(temp, in);
478
479 // Compute minimum of array temp of absolute magnitudes
480 cudaReal result = Reduce::min(temp);
481 temp.dissociate();
482
483 return result;
484 }
485
486} // namespace Reduce
487} // namespace Pscf
Dynamic array on the GPU device with aligned data.
Definition DeviceArray.h:96
void dissociate()
Dissociate this object from an externally owned memory block.
bool isAllocated() const
Return true if the array has allocated data, false otherwise.
void associate(DeviceArray< Data > &arr, int beginId, int capacity)
Associate this object with a slice of a different DeviceArray.
int capacity() const
Return array capacity.
Data * cArray()
Return pointer to underlying C array.
#define UTIL_CHECK(condition)
Assertion macro suitable for serial or parallel production code.
Definition global.h:68
double minAbs(Array< double > const &in)
Get minimum absolute magnitude of array elements .
Definition Reduce.cpp:229
double min(Array< double > const &in)
Get minimum of array elements .
Definition Reduce.cpp:198
double innerProduct(Array< double > const &a, Array< double > const &b)
Compute Euclidean inner product of two real arrays .
Definition Reduce.cpp:117
double maxAbs(Array< double > const &in)
Get maximum absolute magnitude of array elements .
Definition Reduce.cpp:182
double sumSq(Array< double > const &in)
Compute sum of of squares of array elements (real).
Definition Reduce.cpp:82
double sum(Array< double > const &in)
Compute sum of array elements (real).
Definition Reduce.cpp:20
double max(Array< double > const &in)
Get maximum of array elements (real).
Definition Reduce.cpp:151
void sqV(Array< double > &a, Array< double > const &b)
Vector element-wise square, a[i] = b[i]*b[i] (real).
Definition VecOp.cpp:334
void sqAbsV(Array< double > &a, Array< fftw_complex > const &b)
Square of absolute magnitude, a[i] = |b[i]|^2 (complex).
Definition VecOpCx.cpp:698
void absV(Array< double > &a, Array< double > const &b)
Element-wise absolute magnitude, a[i] = abs(b[i]) (real).
Definition VecOp.cpp:349
void mulVV(Array< double > &a, Array< double > const &b, Array< double > const &c)
Vector-vector multiplication, a[i] = b[i] * c[i] (real).
Definition VecOp.cpp:125
cudaReal sumSqAbs(DeviceArray< cudaComplex > const &in)
Return sum of squared magnitudes of elements of a complex array.
Definition Reduce.cu:328
void freeWorkSpace()
Free any private work space currently allocated for reductions.
Definition Reduce.cu:203
void init()
Initialize static variables in Pscf::ThreadArray namespace.
Reduction operations performed on a CPU or GPU.
Definition Reduce.cpp:13
PSCF package top-level namespace.
cufftDoubleComplex cudaComplex
Complex number type used in CPU code that uses FFTW.
Definition cudaTypes.h:22
cufftDoubleReal cudaReal
Real number type used in CPU code that uses FFTW.
Definition cudaTypes.h:35