10#include <pscf/cuda/ThreadArray.h>
11#include <pscf/cuda/HostDArray.h>
12#include <pscf/cuda/cudaErrorCheck.h>
39 int nThreads = blockDim.x * gridDim.x;
40 int startID = blockIdx.x * blockDim.x + threadIdx.x;
41 for (
int i = startID; i < n; i +=
nThreads) {
42 a[i] = (b1[i] * c1) + (b2[i] * c2);
61 int nThreads = blockDim.x * gridDim.x;
62 int startID = blockIdx.x * blockDim.x + threadIdx.x;
63 for (
int i = startID; i < n; i +=
nThreads) {
81 int nThreads = blockDim.x * gridDim.x;
82 int startID = blockIdx.x * blockDim.x + threadIdx.x;
83 for (
int i = startID; i < n; i +=
nThreads) {
107 int nThreads = blockDim.x * gridDim.x;
108 int startID = blockIdx.x * blockDim.x + threadIdx.x;
109 for (
int i = startID; i < n; i +=
nThreads) {
110 a[i] = (b1[i] * c1) + (b2[i] * c2) + (b3[i] * c3);
131 int nThreads = blockDim.x * gridDim.x;
132 int startID = blockIdx.x * blockDim.x + threadIdx.x;
133 for (
int i = startID; i < n; i +=
nThreads) {
134 a[i] = (b1[i] * c1) + (b2[i] * c2) + s;
157 int nThreads = blockDim.x * gridDim.x;
158 int startID = blockIdx.x * blockDim.x + threadIdx.x;
159 for (
int i = startID; i < n; i +=
nThreads) {
160 a[i] = b[i] - c[i] - d;
179 int nThreads = blockDim.x * gridDim.x;
180 int startID = blockIdx.x * blockDim.x + threadIdx.x;
181 for (
int i = startID; i < n; i +=
nThreads) {
182 a[i].x /= (b[i] * c);
183 a[i].y /= (b[i] * c);
201 int nThreads = blockDim.x * gridDim.x;
202 int startID = blockIdx.x * blockDim.x + threadIdx.x;
203 for (
int i = startID; i < n; i +=
nThreads) {
204 a[i] = exp(b[i] * c);
222 int nThreads = blockDim.x * gridDim.x;
223 int startID = blockIdx.x * blockDim.x + threadIdx.x;
225 for (
int i = startID; i < n; i +=
nThreads) {
247 int nThreads = blockDim.x * gridDim.x;
248 int startID = blockIdx.x * blockDim.x + threadIdx.x;
250 for (
int i = startID; i < n; i +=
nThreads) {
252 a1[i] = b1[i] * input;
253 a2[i] = b2[i] * input;
269 int nThreads = blockDim.x * gridDim.x;
270 int startID = blockIdx.x * blockDim.x + threadIdx.x;
272 for (
int i = startID; i < n; i +=
nThreads) {
296 const int nVecs,
const int n)
298 int nThreads = blockDim.x * gridDim.x;
299 int startID = blockIdx.x * blockDim.x + threadIdx.x;
300 for (
int i = startID; i < n; i +=
nThreads) {
302 for (
int j = 1; j < nVecs; j++) {
326 const int nVecs,
const int n)
328 int nThreads = blockDim.x * gridDim.x;
329 int startID = blockIdx.x * blockDim.x + threadIdx.x;
330 for (
int i = startID; i < n; i +=
nThreads) {
332 for (
int j = 1; j < nVecs; j++) {
349 int nThreads = blockDim.x * gridDim.x;
350 int startID = blockIdx.x * blockDim.x + threadIdx.x;
352 for (
int i = startID; i < n; i +=
nThreads) {
353 tmp = (b[i].x * b[i].x) + (b[i].y * b[i].y);
374 int nBlocks, nThreads;
378 _addVcVc<<<nBlocks, nThreads>>>(a.
cArray(),
381 cudaErrorCheck( cudaGetLastError() );
395 int nBlocks, nThreads;
399 _addVcS<<<nBlocks, nThreads>>>(a.
cArray(), b.
cArray(), c, s, n);
400 cudaErrorCheck( cudaGetLastError() );
417 int nBlocks, nThreads;
421 _addVcVcVc<<<nBlocks, nThreads>>>(a.
cArray(),
425 cudaErrorCheck( cudaGetLastError() );
441 int nBlocks, nThreads;
445 _addVcVcS<<<nBlocks, nThreads>>>(a.
cArray(),
449 cudaErrorCheck( cudaGetLastError() );
465 int nBlocks, nThreads;
469 _addEqVc<<<nBlocks, nThreads>>>(a.
cArray(), b.
cArray(), c, n);
470 cudaErrorCheck( cudaGetLastError() );
483 int nBlocks, nThreads;
487 _divEqVc<<<nBlocks, nThreads>>>(a.
cArray(), b.
cArray(), c, n);
488 cudaErrorCheck( cudaGetLastError() );
501 int nBlocks, nThreads;
505 _expVc<<<nBlocks, nThreads>>>(a.
cArray(), b.
cArray(), c, n);
506 cudaErrorCheck( cudaGetLastError() );
520 int nBlocks, nThreads;
524 _eqVPair<<<nBlocks, nThreads>>>(a1.
cArray(), a2.
cArray(),
526 cudaErrorCheck( cudaGetLastError() );
544 int nBlocks, nThreads;
548 _mulVVPair<<<nBlocks, nThreads>>>(a1.
cArray(), a2.
cArray(),
551 cudaErrorCheck( cudaGetLastError() );
565 int nBlocks, nThreads;
569 _mulEqVPair<<<nBlocks, nThreads>>>(a1.
cArray(), a2.
cArray(),
571 cudaErrorCheck( cudaGetLastError() );
580 int nVecs = vecs.capacity();
582 int n = vecs[0].capacity();
585 addVV(a, vecs[0], vecs[1]);
591 for (
int i = 0; i < nVecs; i++) {
592 vecs_h[i] = vecs[i].
cArray();
598 int nBlocks, nThreads;
602 _addVMany<<<nBlocks, nThreads>>>(a.
cArray(),
603 vecs_d.
cArray(), nVecs, n);
604 cudaErrorCheck( cudaGetLastError() );
613 int nVecs = vecs.capacity();
615 int n = vecs[0]->capacity();
618 addVV(a, *vecs[0], *vecs[1]);
624 for (
int i = 0; i < nVecs; i++) {
625 vecs_h[i] = vecs[i]->
cArray();
631 int nBlocks, nThreads;
635 _addVMany<<<nBlocks, nThreads>>>(a.
cArray(),
636 vecs_d.
cArray(), nVecs, n);
637 cudaErrorCheck( cudaGetLastError() );
646 int nVecs = vecs.capacity();
648 int n = vecs[0].capacity();
651 mulVV(a, vecs[0], vecs[1]);
657 for (
int i = 0; i < nVecs; i++) {
658 vecs_h[i] = vecs[i].
cArray();
664 int nBlocks, nThreads;
668 _mulVMany<<<nBlocks, nThreads>>>(a.
cArray(), vecs_d.
cArray(),
670 cudaErrorCheck( cudaGetLastError() );
679 int nVecs = vecs.capacity();
681 int n = vecs[0]->capacity();
684 mulVV(a, *vecs[0], *vecs[1]);
690 for (
int i = 0; i < nVecs; i++) {
691 vecs_h[i] = vecs[i]->
cArray();
697 int nBlocks, nThreads;
701 _mulVMany<<<nBlocks, nThreads>>>(a.
cArray(),
702 vecs_d.
cArray(), nVecs, n);
703 cudaErrorCheck( cudaGetLastError() );
716 int nBlocks, nThreads;
720 _sqSqAbsV<<<nBlocks, nThreads>>>(a.
cArray(), b.
cArray(), n);
721 cudaErrorCheck( cudaGetLastError() );
Dynamic array on the GPU device with aligned data.
int capacity() const
Return array capacity.
Data * cArray()
Return pointer to underlying C array.
Template for dynamic array stored in host CPU memory.
Data * cArray()
Return a pointer to the underlying C array.
Dynamically allocatable contiguous array template.
#define UTIL_CHECK(condition)
Assertion macro suitable for serial or parallel production code.
double sum(Array< double > const &in)
Compute sum of array elements (real).
void sqSqAbsV(Array< double > &a, Array< fftw_complex > const &b)
Fourth power of absolute magnitude, a[i] = |b[i]|^4 (complex).
void expVc(Array< double > &a, Array< double > const &b, const double c)
Exponentiation a scaled vector, a[i] = exp(b[i]*c) (real).
void mulVV(Array< double > &a, Array< double > const &b, Array< double > const &c)
Vector-vector multiplication, a[i] = b[i] * c[i] (real).
void addVV(Array< double > &a, Array< double > const &b, Array< double > const &c)
Vector-vector addition, a[i] = b[i] + c[i] (real)
void divEqVc(Array< fftw_complex > &a, Array< double > const &b, double c)
Vector division in-place w/ coeff., a[i] /= (b[i] * c).
void addEqVc(Array< double > &a, Array< double > const &b, const double c)
Add scaled vector in-place, a[i] += b[i]*c (real).
void setThreadsLogical(int nThreadsLogical)
Given total number of threads, set 1D execution configuration.
int nThreads()
Get the number of threads per block for execution.
void mulVVPair(Array< double > &a1, Array< double > &a2, Array< double > const &b1, Array< double > const &b2, Array< double > const &c)
Vector multiplication in pairs, ax[i] = bx[i] * s[i], x=1,2.
void eqVPair(Array< double > &a1, Array< double > &a2, Array< double > const &b)
Vector assignment in pairs, ax[i] = b[i], x = 1, 2.
void addVMany(DeviceArray< cudaReal > &a, DArray< DeviceArray< cudaReal > > const &vecs)
Add an arbitrary number of vectors pointwise (real).
void mulVMany(DeviceArray< cudaReal > &a, DArray< DeviceArray< cudaReal > > const &vecs)
Multiply an undefined number of vectors pointwise (real).
void mulEqVPair(Array< double > &a1, Array< double > &a2, Array< double > const &b)
In-place vector multiplication in pairs, ax[i] *= b[i], x=1,2.
Vector operations on GPU or CPU.
void addVcVcS(Array< double > &a, Array< double > const &b1, const double c1, Array< double > const &b2, const double c2, const double s)
Add scaled vectors + scalar, a[i] = b1[i]*c1 + b2[2]*c2 + s (real).
void addVcS(Array< double > &a, Array< double > const &b, const double c, const double s)
Add a scaled vector and a scalar, a[i] = b[i]*c + s (real).
void addVcVc(Array< double > &a, Array< double > const &b1, const double c1, Array< double > const &b2, const double c2)
Add two scaled vectors, a[i] = b1[i]*c1 + b2[2]*c2 (real).
void addVcVcVc(Array< double > &a, Array< double > const &b1, const double c1, Array< double > const &b2, const double c2, Array< double > const &b3, const double c3)
Add scaled vectors, a[i] = b1[i]*c1 + b2[i]*c2 + b3[i]*c3 (real).
PSCF package top-level namespace.
cufftDoubleComplex cudaComplex
Complex number type used in CPU code that uses FFTW.
cufftDoubleReal cudaReal
Real number type used in CPU code that uses FFTW.