10#include <pscf/cuda/ThreadArray.h>
11#include <pscf/cuda/HostDArray.h>
12#include <pscf/cuda/DeviceMemory.h>
13#include <pscf/cuda/cudaErrorCheck.h>
14#include <pscf/cuda/complex.h>
15#include <util/misc/Log.h>
50 static DeviceMemory reduceSpace_{};
53 static DeviceMemory transformSpace_{};
59 struct addComplexFunctor {
61 __host__ __device__
inline
81 DeviceArray<cudaReal> out(1);
87 error = cub::DeviceReduce::Sum(
nullptr, workSize,
90 reduceSpace_.resize(workSize);
91 UTIL_CHECK(reduceSpace_.capacity() >= workSize);
94 error = cub::DeviceReduce::Sum(reduceSpace_.cArray(), workSize,
99 HostDArray<cudaReal> out_h;
115 DeviceArray<cudaComplex> out(1);
121 auto op = addComplexFunctor{};
123 error = cub::DeviceReduce::Reduce(
nullptr, workSize,
124 inPtr, outPtr, n, op, init);
126 reduceSpace_.resize(workSize);
129 error = cub::DeviceReduce::Reduce(reduceSpace_.cArray(), workSize,
130 inPtr, outPtr, n, op, init);
134 HostDArray<cudaComplex> out_h(1);
136 return std::complex<cudaReal>(out_h[0].x, out_h[0].y);
147 DeviceArray<cudaReal> out(1);
153 error = cub::DeviceReduce::Max(
nullptr, workSize,
156 reduceSpace_.resize(workSize);
159 error = cub::DeviceReduce::Max(reduceSpace_.cArray(), workSize,
164 HostDArray<cudaReal> out_h(1);
177 DeviceArray<cudaReal> out(1);
183 error = cub::DeviceReduce::Min(
nullptr, workSize,
186 reduceSpace_.resize(workSize);
189 error = cub::DeviceReduce::Min(reduceSpace_.cArray(), workSize,
194 HostDArray<cudaReal> out_h(1);
205 reduceSpace_.deallocate();
206 transformSpace_.deallocate();
223 return sum(inPtr, n);
236 const int n = end - begin;
241 return sum(inPtr, n);
256 return sum(inPtr, n);
270 const int n = end - begin;
275 return sum(inPtr, n);
287 int workSize = n *
sizeof(
cudaReal);
288 transformSpace_.resize(workSize);
311 transformSpace_.resize(workSize);
334 int workSize = n *
sizeof(
cudaReal);
335 transformSpace_.resize(workSize);
361 int workSize = n *
sizeof(
cudaReal);
362 transformSpace_.resize(workSize);
388 return max(inPtr, n);
399 const int n = end - begin;
404 return max(inPtr, n);
416 int workSize = n *
sizeof(
cudaReal);
417 transformSpace_.resize(workSize);
443 return min(inPtr, n);
454 const int n = end - begin;
459 return min(inPtr, n);
471 int workSize = n *
sizeof(
cudaReal);
472 transformSpace_.resize(workSize);
Dynamic array on the GPU device with aligned data.
void dissociate()
Dissociate this object from an externally owned memory block.
bool isAllocated() const
Return true if the array has allocated data, false otherwise.
void associate(DeviceArray< Data > &arr, int beginId, int capacity)
Associate this object with a slice of a different DeviceArray.
int capacity() const
Return array capacity.
Data * cArray()
Return pointer to underlying C array.
#define UTIL_CHECK(condition)
Assertion macro suitable for serial or parallel production code.
double minAbs(Array< double > const &in)
Get minimum absolute magnitude of array elements.
double min(Array< double > const &in)
Get minimum of array elements.
double innerProduct(Array< double > const &a, Array< double > const &b)
Compute Euclidean inner product of two real arrays.
double maxAbs(Array< double > const &in)
Get maximum absolute magnitude of array elements.
double sumSq(Array< double > const &in)
Compute sum of squares of array elements (real).
double sum(Array< double > const &in)
Compute sum of array elements (real).
double max(Array< double > const &in)
Get maximum of array elements (real).
void sqV(Array< double > &a, Array< double > const &b)
Vector element-wise square, a[i] = b[i]*b[i] (real).
void sqAbsV(Array< double > &a, Array< fftw_complex > const &b)
Square of absolute magnitude, a[i] = |b[i]|^2 (complex).
void absV(Array< double > &a, Array< double > const &b)
Element-wise absolute magnitude, a[i] = abs(b[i]) (real).
void mulVV(Array< double > &a, Array< double > const &b, Array< double > const &c)
Vector-vector multiplication, a[i] = b[i] * c[i] (real).
cudaReal sumSqAbs(DeviceArray< cudaComplex > const &in)
Return sum of squared magnitudes of elements of a complex array.
void freeWorkSpace()
Free any private work space currently allocated for reductions.
void init()
Initialize static variables in Pscf::ThreadArray namespace.
Reduction operations performed on a CPU or GPU.
PSCF package top-level namespace.
cufftDoubleComplex cudaComplex
Complex number type used in GPU code that uses cuFFT.
cufftDoubleReal cudaReal
Real number type used in GPU code that uses cuFFT.