10#include <pscf/cuda/ThreadArray.h>
11#include <pscf/cuda/HostDArray.h>
12#include <pscf/cuda/cudaErrorCheck.h>
/*
* Vector addition with coefficients, a[i] = (b[i] * c) + (d[i] * e).
* Grid-stride loop: each thread handles indices startID, startID+nThreads, ...
* so any launch configuration covers all n elements.
*/
__global__ void _addVcVc(cudaReal* a, cudaReal const * b, cudaReal const c,
                         cudaReal const * d, cudaReal const e, const int n)
{
   int nThreads = blockDim.x * gridDim.x;
   int startID = blockIdx.x * blockDim.x + threadIdx.x;
   for (int i = startID; i < n; i += nThreads) {
      a[i] = (b[i] * c) + (d[i] * e);
   }
}
/*
* Three-vector addition with coefficients,
* a[i] = (b[i] * c) + (d[i] * e) + (f[i] * g).
* Grid-stride loop; valid for any 1D launch configuration.
*/
__global__ void _addVcVcVc(cudaReal* a, cudaReal const * b, cudaReal const c,
                           cudaReal const * d, cudaReal const e,
                           cudaReal const * f, cudaReal const g, const int n)
{
   int nThreads = blockDim.x * gridDim.x;
   int startID = blockIdx.x * blockDim.x + threadIdx.x;
   for (int i = startID; i < n; i += nThreads) {
      a[i] = (b[i] * c) + (d[i] * e) + (f[i] * g);
   }
}
/*
* In-place vector addition with coefficient, a[i] += b[i] * c.
* Grid-stride loop; valid for any 1D launch configuration.
* NOTE(review): the loop body was lost in extraction; reconstructed from the
* wrapper documentation (a[i] += b[i] * c) — confirm against upstream.
*/
__global__ void _addEqVc(cudaReal* a, cudaReal const * b, cudaReal const c,
                         const int n)
{
   int nThreads = blockDim.x * gridDim.x;
   int startID = blockIdx.x * blockDim.x + threadIdx.x;
   for (int i = startID; i < n; i += nThreads) {
      a[i] += b[i] * c;
   }
}
/*
* Vector subtraction with scalar, a[i] = b[i] - c[i] - d.
* Grid-stride loop; valid for any 1D launch configuration.
*/
__global__ void _subVVS(cudaReal* a, cudaReal const * b,
                        cudaReal const * c, cudaReal const d, const int n)
{
   int nThreads = blockDim.x * gridDim.x;
   int startID = blockIdx.x * blockDim.x + threadIdx.x;
   for (int i = startID; i < n; i += nThreads) {
      a[i] = b[i] - c[i] - d;
   }
}
/*
* In-place complex-by-real division with coefficient, a[i] /= (b[i] * c).
* Real and imaginary parts are each divided by the real quantity b[i] * c.
* Grid-stride loop; valid for any 1D launch configuration.
*/
__global__ void _divEqVc(cudaComplex* a, cudaReal const * b, cudaReal const c,
                         const int n)
{
   int nThreads = blockDim.x * gridDim.x;
   int startID = blockIdx.x * blockDim.x + threadIdx.x;
   for (int i = startID; i < n; i += nThreads) {
      a[i].x /= (b[i] * c);
      a[i].y /= (b[i] * c);
   }
}
/*
* Vector exponentiation with coefficient, a[i] = exp(b[i] * c).
* Grid-stride loop; valid for any 1D launch configuration.
* Uses the CUDA math overload of exp() matching cudaReal's precision.
*/
__global__ void _expVc(cudaReal* a, cudaReal const * b, cudaReal const c,
                       const int n)
{
   int nThreads = blockDim.x * gridDim.x;
   int startID = blockIdx.x * blockDim.x + threadIdx.x;
   for (int i = startID; i < n; i += nThreads) {
      a[i] = exp(b[i] * c);
   }
}
/*
* Vector assignment in pairs, a1[i] = s[i] and a2[i] = s[i].
* Grid-stride loop; valid for any 1D launch configuration.
* NOTE(review): the loop body was lost in extraction; reconstructed from the
* wrapper documentation ("vector assignment in pairs") — confirm upstream.
*/
__global__ void _eqVPair(cudaReal* a1, cudaReal* a2, cudaReal const * s,
                         const int n)
{
   int nThreads = blockDim.x * gridDim.x;
   int startID = blockIdx.x * blockDim.x + threadIdx.x;
   for (int i = startID; i < n; i += nThreads) {
      a1[i] = s[i];
      a2[i] = s[i];
   }
}
/*
* Vector multiplication in pairs, a1[i] = b1[i] * s[i] and a2[i] = b2[i] * s[i].
* The shared factor s[i] is read once per iteration into a local register.
* Grid-stride loop; valid for any 1D launch configuration.
* NOTE(review): the declaration of the local 'input' was lost in extraction;
* reconstructed as input = s[i] from the wrapper documentation — confirm.
*/
__global__ void _mulVVPair(cudaReal* a1, cudaReal * a2,
                           cudaReal const * b1, cudaReal const * b2,
                           cudaReal const * s, const int n)
{
   int nThreads = blockDim.x * gridDim.x;
   int startID = blockIdx.x * blockDim.x + threadIdx.x;
   for (int i = startID; i < n; i += nThreads) {
      cudaReal input = s[i];  // single load, used by both products
      a1[i] = b1[i] * input;
      a2[i] = b2[i] * input;
   }
}
/*
* In-place vector multiplication in pairs, a1[i] *= s[i] and a2[i] *= s[i].
* Grid-stride loop; valid for any 1D launch configuration.
* NOTE(review): the loop body was lost in extraction; reconstructed from the
* wrapper documentation (ax[i] *= s[i]) — confirm against upstream.
*/
__global__ void _mulEqVPair(cudaReal* a1, cudaReal* a2, cudaReal const * s,
                            const int n)
{
   int nThreads = blockDim.x * gridDim.x;
   int startID = blockIdx.x * blockDim.x + threadIdx.x;
   for (int i = startID; i < n; i += nThreads) {
      a1[i] *= s[i];
      a2[i] *= s[i];
   }
}
/*
* Pointwise addition of an arbitrary number of vectors,
* a[i] = vecs[0][i] + vecs[1][i] + ... + vecs[nVecs-1][i].
* vecs is a device array of nVecs device pointers; requires nVecs >= 1.
* Grid-stride loop; valid for any 1D launch configuration.
* NOTE(review): the accumulate/store lines were lost in extraction;
* reconstructed from the visible sum initialization and inner loop header.
*/
__global__ void _addVMany(cudaReal* a, cudaReal const ** vecs,
                          const int nVecs, const int n)
{
   int nThreads = blockDim.x * gridDim.x;
   int startID = blockIdx.x * blockDim.x + threadIdx.x;
   for (int i = startID; i < n; i += nThreads) {
      cudaReal sum = vecs[0][i];
      for (int j = 1; j < nVecs; j++) {
         sum += vecs[j][i];
      }
      a[i] = sum;
   }
}
/*
* Pointwise multiplication of an arbitrary number of vectors,
* a[i] = vecs[0][i] * vecs[1][i] * ... * vecs[nVecs-1][i].
* vecs is a device array of nVecs device pointers; requires nVecs >= 1.
* Grid-stride loop; valid for any 1D launch configuration.
* NOTE(review): the accumulate/store lines were lost in extraction;
* reconstructed from the visible prod initialization and inner loop header.
*/
__global__ void _mulVMany(cudaReal* a, cudaReal const ** vecs,
                          const int nVecs, const int n)
{
   int nThreads = blockDim.x * gridDim.x;
   int startID = blockIdx.x * blockDim.x + threadIdx.x;
   for (int i = startID; i < n; i += nThreads) {
      cudaReal prod = vecs[0][i];
      for (int j = 1; j < nVecs; j++) {
         prod *= vecs[j][i];
      }
      a[i] = prod;
   }
}
/*
* Squared norm of complex elements, a[i] = |b[i]|^2 = b.x^2 + b.y^2.
* Grid-stride loop; valid for any 1D launch configuration.
*/
__global__ void _sqNormV(cudaReal* a, cudaComplex const * b, const int n)
{
   int nThreads = blockDim.x * gridDim.x;
   int startID = blockIdx.x * blockDim.x + threadIdx.x;
   for (int i = startID; i < n; i += nThreads) {
      a[i] = (b[i].x * b[i].x) + (b[i].y * b[i].y);
   }
}
/*
* Fourth power of complex norm, a[i] = |b[i]|^4 = (b.x^2 + b.y^2)^2.
* Grid-stride loop; valid for any 1D launch configuration.
* NOTE(review): the declaration of 'tmp' and the store were lost in
* extraction; reconstructed from the documented contract a[i] = norm(b[i])^4.
*/
__global__ void _sqSqNormV(cudaReal* a, cudaComplex const * b, const int n)
{
   int nThreads = blockDim.x * gridDim.x;
   int startID = blockIdx.x * blockDim.x + threadIdx.x;
   for (int i = startID; i < n; i += nThreads) {
      cudaReal tmp = (b[i].x * b[i].x) + (b[i].y * b[i].y);
      a[i] = tmp * tmp;
   }
}
/*
* Vector addition w/ coefficients, a[i] = (b[i]*c) + (d[i]*e), kernel wrapper.
* NOTE(review): only the launch-config declaration and error check survived
* extraction; signature taken from the file's documentation index and body
* reconstructed from the sibling wrappers' pattern — confirm upstream.
*/
void addVcVc(DeviceArray<cudaReal>& a,
             DeviceArray<cudaReal> const & b, cudaReal const c,
             DeviceArray<cudaReal> const & d, cudaReal const e)
{
   const int n = a.capacity();
   UTIL_CHECK(b.capacity() >= n);
   UTIL_CHECK(d.capacity() >= n);

   // GPU resources: 1D execution configuration for n logical threads
   int nBlocks, nThreads;
   ThreadArray::setThreadsLogical(n, nBlocks, nThreads);

   _addVcVc<<<nBlocks, nThreads>>>(a.cArray(), b.cArray(), c,
                                   d.cArray(), e, n);
   cudaErrorCheck( cudaGetLastError() );
}
/*
* Three-vector addition with coefficients,
* a[i] = (b[i]*c) + (d[i]*e) + (f[i]*g), kernel wrapper.
* NOTE(review): only fragments survived extraction; signature taken from the
* documentation index, body from the sibling wrappers' pattern — confirm.
*/
void addVcVcVc(DeviceArray<cudaReal>& a,
               DeviceArray<cudaReal> const & b, cudaReal const c,
               DeviceArray<cudaReal> const & d, cudaReal const e,
               DeviceArray<cudaReal> const & f, cudaReal const g)
{
   const int n = a.capacity();
   UTIL_CHECK(b.capacity() >= n);
   UTIL_CHECK(d.capacity() >= n);
   UTIL_CHECK(f.capacity() >= n);

   // GPU resources: 1D execution configuration for n logical threads
   int nBlocks, nThreads;
   ThreadArray::setThreadsLogical(n, nBlocks, nThreads);

   _addVcVcVc<<<nBlocks, nThreads>>>(a.cArray(), b.cArray(), c,
                                     d.cArray(), e, f.cArray(), g, n);
   cudaErrorCheck( cudaGetLastError() );
}
/*
* Vector addition in-place w/ coefficient, a[i] += b[i] * c, kernel wrapper.
* NOTE(review): signature and capacity checks reconstructed from the
* documentation index; the launch line itself survived extraction.
*/
void addEqVc(DeviceArray<cudaReal>& a,
             DeviceArray<cudaReal> const & b, cudaReal const c)
{
   const int n = a.capacity();
   UTIL_CHECK(b.capacity() >= n);

   // GPU resources: 1D execution configuration for n logical threads
   int nBlocks, nThreads;
   ThreadArray::setThreadsLogical(n, nBlocks, nThreads);

   _addEqVc<<<nBlocks, nThreads>>>(a.cArray(), b.cArray(), c, n);
   cudaErrorCheck( cudaGetLastError() );
}
/*
* Vector subtraction, a[i] = b[i] - c[i] - d, kernel wrapper.
* NOTE(review): only fragments survived extraction; signature taken from the
* documentation index, body from the sibling wrappers' pattern — confirm.
*/
void subVVS(DeviceArray<cudaReal>& a, DeviceArray<cudaReal> const & b,
            DeviceArray<cudaReal> const & c, cudaReal const d)
{
   const int n = a.capacity();
   UTIL_CHECK(b.capacity() >= n);
   UTIL_CHECK(c.capacity() >= n);

   // GPU resources: 1D execution configuration for n logical threads
   int nBlocks, nThreads;
   ThreadArray::setThreadsLogical(n, nBlocks, nThreads);

   _subVVS<<<nBlocks, nThreads>>>(a.cArray(), b.cArray(), c.cArray(), d, n);
   cudaErrorCheck( cudaGetLastError() );
}
/*
* Vector division in-place w/ coefficient, a[i] /= (b[i] * c), kernel wrapper.
* a is complex; both real and imaginary parts are divided by b[i] * c.
* NOTE(review): signature and capacity checks reconstructed from the
* documentation index; the launch line itself survived extraction.
*/
void divEqVc(DeviceArray<cudaComplex>& a,
             DeviceArray<cudaReal> const & b, cudaReal const c)
{
   const int n = a.capacity();
   UTIL_CHECK(b.capacity() >= n);

   // GPU resources: 1D execution configuration for n logical threads
   int nBlocks, nThreads;
   ThreadArray::setThreadsLogical(n, nBlocks, nThreads);

   _divEqVc<<<nBlocks, nThreads>>>(a.cArray(), b.cArray(), c, n);
   cudaErrorCheck( cudaGetLastError() );
}
/*
* Vector exponentiation w/ coefficient, a[i] = exp(b[i]*c), kernel wrapper.
* NOTE(review): signature and capacity checks reconstructed from the
* documentation index; the launch line itself survived extraction.
*/
void expVc(DeviceArray<cudaReal>& a,
           DeviceArray<cudaReal> const & b, cudaReal const c)
{
   const int n = a.capacity();
   UTIL_CHECK(b.capacity() >= n);

   // GPU resources: 1D execution configuration for n logical threads
   int nBlocks, nThreads;
   ThreadArray::setThreadsLogical(n, nBlocks, nThreads);

   _expVc<<<nBlocks, nThreads>>>(a.cArray(), b.cArray(), c, n);
   cudaErrorCheck( cudaGetLastError() );
}
/*
* Vector assignment in pairs, a1[i] = s[i] and a2[i] = s[i], kernel wrapper.
* NOTE(review): only fragments survived extraction; signature taken from the
* documentation index, body from the sibling wrappers' pattern — confirm.
*/
void eqVPair(DeviceArray<cudaReal>& a1, DeviceArray<cudaReal>& a2,
             DeviceArray<cudaReal> const & s)
{
   const int n = s.capacity();
   UTIL_CHECK(a1.capacity() >= n);
   UTIL_CHECK(a2.capacity() >= n);

   // GPU resources: 1D execution configuration for n logical threads
   int nBlocks, nThreads;
   ThreadArray::setThreadsLogical(n, nBlocks, nThreads);

   _eqVPair<<<nBlocks, nThreads>>>(a1.cArray(), a2.cArray(), s.cArray(), n);
   cudaErrorCheck( cudaGetLastError() );
}
/*
* Vector multiplication in pairs, a1[i] = b1[i]*s[i] and a2[i] = b2[i]*s[i],
* kernel wrapper.
* NOTE(review): only fragments survived extraction; signature taken from the
* documentation index, body from the sibling wrappers' pattern — confirm.
*/
void mulVVPair(DeviceArray<cudaReal>& a1, DeviceArray<cudaReal>& a2,
               DeviceArray<cudaReal> const & b1,
               DeviceArray<cudaReal> const & b2,
               DeviceArray<cudaReal> const & s)
{
   const int n = s.capacity();
   UTIL_CHECK(a1.capacity() >= n);
   UTIL_CHECK(a2.capacity() >= n);
   UTIL_CHECK(b1.capacity() >= n);
   UTIL_CHECK(b2.capacity() >= n);

   // GPU resources: 1D execution configuration for n logical threads
   int nBlocks, nThreads;
   ThreadArray::setThreadsLogical(n, nBlocks, nThreads);

   _mulVVPair<<<nBlocks, nThreads>>>(a1.cArray(), a2.cArray(), b1.cArray(),
                                     b2.cArray(), s.cArray(), n);
   cudaErrorCheck( cudaGetLastError() );
}
/*
* In-place vector multiplication in pairs, a1[i] *= s[i] and a2[i] *= s[i],
* kernel wrapper.
* NOTE(review): only fragments survived extraction; signature taken from the
* documentation index, body from the sibling wrappers' pattern — confirm.
*/
void mulEqVPair(DeviceArray<cudaReal>& a1, DeviceArray<cudaReal>& a2,
                DeviceArray<cudaReal> const & s)
{
   const int n = s.capacity();
   UTIL_CHECK(a1.capacity() >= n);
   UTIL_CHECK(a2.capacity() >= n);

   // GPU resources: 1D execution configuration for n logical threads
   int nBlocks, nThreads;
   ThreadArray::setThreadsLogical(n, nBlocks, nThreads);

   _mulEqVPair<<<nBlocks, nThreads>>>(a1.cArray(), a2.cArray(),
                                      s.cArray(), n);
   cudaErrorCheck( cudaGetLastError() );
}
/*
* Add an arbitrary number of vectors pointwise, kernel wrapper.
* Copies the device pointers of all input arrays to the GPU, then launches
* _addVMany. For exactly two vectors, delegates to the cheaper addVV.
* NOTE(review): several statements were lost in extraction; host/device
* pointer-array staging reconstructed from the surviving fragments — confirm.
*/
void addVMany(DeviceArray<cudaReal>& a,
              DArray<DeviceArray<cudaReal> > const & vecs)
{
   int nVecs = vecs.capacity();
   UTIL_CHECK(nVecs > 1);
   int n = vecs[0].capacity();
   UTIL_CHECK(a.capacity() >= n);

   if (nVecs == 2) {
      addVV(a, vecs[0], vecs[1]);
      return;
   }

   // Collect raw device pointers on the host, then transfer to the device
   HostDArray<cudaReal const *> vecs_h(nVecs);
   for (int i = 0; i < nVecs; i++) {
      vecs_h[i] = vecs[i].cArray();
   }
   DeviceArray<cudaReal const *> vecs_d(nVecs);
   vecs_d = vecs_h;  // host-to-device copy of the pointer array

   // GPU resources: 1D execution configuration for n logical threads
   int nBlocks, nThreads;
   ThreadArray::setThreadsLogical(n, nBlocks, nThreads);

   _addVMany<<<nBlocks, nThreads>>>(a.cArray(), vecs_d.cArray(), nVecs, n);
   cudaErrorCheck( cudaGetLastError() );
}
/*
* Add an arbitrary number of vectors pointwise, kernel wrapper.
* Overload taking an array of pointers to DeviceArray objects.
* For exactly two vectors, delegates to the cheaper addVV.
* NOTE(review): several statements were lost in extraction; signature
* inferred from the visible '->' accesses on vecs elements — confirm.
*/
void addVMany(DeviceArray<cudaReal>& a,
              DArray<DeviceArray<cudaReal> *> const & vecs)
{
   int nVecs = vecs.capacity();
   UTIL_CHECK(nVecs > 1);
   int n = vecs[0]->capacity();
   UTIL_CHECK(a.capacity() >= n);

   if (nVecs == 2) {
      addVV(a, *vecs[0], *vecs[1]);
      return;
   }

   // Collect raw device pointers on the host, then transfer to the device
   HostDArray<cudaReal const *> vecs_h(nVecs);
   for (int i = 0; i < nVecs; i++) {
      vecs_h[i] = vecs[i]->cArray();
   }
   DeviceArray<cudaReal const *> vecs_d(nVecs);
   vecs_d = vecs_h;  // host-to-device copy of the pointer array

   // GPU resources: 1D execution configuration for n logical threads
   int nBlocks, nThreads;
   ThreadArray::setThreadsLogical(n, nBlocks, nThreads);

   _addVMany<<<nBlocks, nThreads>>>(a.cArray(), vecs_d.cArray(), nVecs, n);
   cudaErrorCheck( cudaGetLastError() );
}
/*
* Multiply an arbitrary number of vectors pointwise, kernel wrapper.
* Copies the device pointers of all input arrays to the GPU, then launches
* _mulVMany. For exactly two vectors, delegates to the cheaper mulVV.
* NOTE(review): several statements were lost in extraction; host/device
* pointer-array staging reconstructed from the surviving fragments — confirm.
*/
void mulVMany(DeviceArray<cudaReal>& a,
              DArray<DeviceArray<cudaReal> > const & vecs)
{
   int nVecs = vecs.capacity();
   UTIL_CHECK(nVecs > 1);
   int n = vecs[0].capacity();
   UTIL_CHECK(a.capacity() >= n);

   if (nVecs == 2) {
      mulVV(a, vecs[0], vecs[1]);
      return;
   }

   // Collect raw device pointers on the host, then transfer to the device
   HostDArray<cudaReal const *> vecs_h(nVecs);
   for (int i = 0; i < nVecs; i++) {
      vecs_h[i] = vecs[i].cArray();
   }
   DeviceArray<cudaReal const *> vecs_d(nVecs);
   vecs_d = vecs_h;  // host-to-device copy of the pointer array

   // GPU resources: 1D execution configuration for n logical threads
   int nBlocks, nThreads;
   ThreadArray::setThreadsLogical(n, nBlocks, nThreads);

   _mulVMany<<<nBlocks, nThreads>>>(a.cArray(), vecs_d.cArray(), nVecs, n);
   cudaErrorCheck( cudaGetLastError() );
}
/*
* Multiply an arbitrary number of vectors pointwise, kernel wrapper.
* Overload taking an array of pointers to DeviceArray objects.
* For exactly two vectors, delegates to the cheaper mulVV.
* NOTE(review): several statements were lost in extraction; signature
* inferred from the visible '->' accesses on vecs elements — confirm.
*/
void mulVMany(DeviceArray<cudaReal>& a,
              DArray<DeviceArray<cudaReal> *> const & vecs)
{
   int nVecs = vecs.capacity();
   UTIL_CHECK(nVecs > 1);
   int n = vecs[0]->capacity();
   UTIL_CHECK(a.capacity() >= n);

   if (nVecs == 2) {
      mulVV(a, *vecs[0], *vecs[1]);
      return;
   }

   // Collect raw device pointers on the host, then transfer to the device
   HostDArray<cudaReal const *> vecs_h(nVecs);
   for (int i = 0; i < nVecs; i++) {
      vecs_h[i] = vecs[i]->cArray();
   }
   DeviceArray<cudaReal const *> vecs_d(nVecs);
   vecs_d = vecs_h;  // host-to-device copy of the pointer array

   // GPU resources: 1D execution configuration for n logical threads
   int nBlocks, nThreads;
   ThreadArray::setThreadsLogical(n, nBlocks, nThreads);

   _mulVMany<<<nBlocks, nThreads>>>(a.cArray(), vecs_d.cArray(), nVecs, n);
   cudaErrorCheck( cudaGetLastError() );
}
/*
* Squared norm of complex vector elements, a[i] = norm(b[i])^2, kernel wrapper.
* NOTE(review): signature and capacity checks reconstructed from the
* documentation index; the launch line itself survived extraction.
*/
void sqNormV(DeviceArray<cudaReal>& a, DeviceArray<cudaComplex> const & b)
{
   const int n = a.capacity();
   UTIL_CHECK(b.capacity() >= n);

   // GPU resources: 1D execution configuration for n logical threads
   int nBlocks, nThreads;
   ThreadArray::setThreadsLogical(n, nBlocks, nThreads);

   _sqNormV<<<nBlocks, nThreads>>>(a.cArray(), b.cArray(), n);
   cudaErrorCheck( cudaGetLastError() );
}
/*
* Complex norm to the fourth power, a[i] = norm(b[i])^4, kernel wrapper.
* NOTE(review): signature and capacity checks reconstructed from the
* documentation index; the launch line itself survived extraction.
*/
void sqSqNormV(DeviceArray<cudaReal>& a, DeviceArray<cudaComplex> const & b)
{
   const int n = a.capacity();
   UTIL_CHECK(b.capacity() >= n);

   // GPU resources: 1D execution configuration for n logical threads
   int nBlocks, nThreads;
   ThreadArray::setThreadsLogical(n, nBlocks, nThreads);

   _sqSqNormV<<<nBlocks, nThreads>>>(a.cArray(), b.cArray(), n);
   cudaErrorCheck( cudaGetLastError() );
}
Dynamic array on the GPU device with aligned data.
int capacity() const
Return allocated capacity.
Data * cArray()
Return pointer to underlying C array.
Template for dynamic array stored in host CPU memory.
Data * cArray()
Return a pointer to the underlying C array.
Dynamically allocatable contiguous array template.
#define UTIL_CHECK(condition)
Assertion macro suitable for serial or parallel production code.
void setThreadsLogical(int nThreadsLogical)
Given total number of threads, set 1D execution configuration.
int nThreads()
Get the number of threads per block for execution.
cudaReal sum(DeviceArray< cudaReal > const &in)
Compute sum of array elements (GPU kernel wrapper).
void addVMany(DeviceArray< cudaReal > &a, DArray< DeviceArray< cudaReal > > const &vecs)
Add an undefined number of vectors pointwise, kernel wrapper.
void addVV(DeviceArray< cudaReal > &a, DeviceArray< cudaReal > const &b, DeviceArray< cudaReal > const &c, const int beginIdA, const int beginIdB, const int beginIdC, const int n)
Vector addition, a[i] = b[i] + c[i], kernel wrapper (cudaReal).
void addVcVcVc(DeviceArray< cudaReal > &a, DeviceArray< cudaReal > const &b, cudaReal const c, DeviceArray< cudaReal > const &d, cudaReal const e, DeviceArray< cudaReal > const &f, cudaReal const g)
Three-vector addition with coefficients, a[i] = (b[i]*c) + (d[i]*e) + (f[i]*g), kernel wrapper.
void mulVV(DeviceArray< cudaReal > &a, DeviceArray< cudaReal > const &b, DeviceArray< cudaReal > const &c, const int beginIdA, const int beginIdB, const int beginIdC, const int n)
Vector multiplication, a[i] = b[i] * c[i], kernel wrapper (cudaReal).
void expVc(DeviceArray< cudaReal > &a, DeviceArray< cudaReal > const &b, cudaReal const c)
Vector exponentiation w/ coefficient, a[i] = exp(b[i]*c), kernel wrapper.
void sqSqNormV(DeviceArray< cudaReal > &a, DeviceArray< cudaComplex > const &b)
Norm of complex number to the 4th power, a[i] = norm(b[i])^4, kernel wrapper.
void mulVVPair(DeviceArray< cudaReal > &a1, DeviceArray< cudaReal > &a2, DeviceArray< cudaReal > const &b1, DeviceArray< cudaReal > const &b2, DeviceArray< cudaReal > const &s)
Vector multiplication in pairs, ax[i] = bx[i] * s[i], kernel wrapper.
void addVcVc(DeviceArray< cudaReal > &a, DeviceArray< cudaReal > const &b, cudaReal const c, DeviceArray< cudaReal > const &d, cudaReal const e)
Vector addition w/ coefficient, a[i] = (b[i]*c) + (d[i]*e), kernel wrapper.
void mulVMany(DeviceArray< cudaReal > &a, DArray< DeviceArray< cudaReal > > const &vecs)
Multiply an undefined number of vectors pointwise, kernel wrapper.
void subVVS(DeviceArray< cudaReal > &a, DeviceArray< cudaReal > const &b, DeviceArray< cudaReal > const &c, cudaReal const d)
Vector subtraction, a[i] = b[i] - c[i] - d, kernel wrapper.
void eqVPair(DeviceArray< cudaReal > &a1, DeviceArray< cudaReal > &a2, DeviceArray< cudaReal > const &s)
Vector assignment in pairs, ax[i] = s[i], kernel wrapper.
void sqNormV(DeviceArray< cudaReal > &a, DeviceArray< cudaComplex > const &b)
Squared norm of complex number, a[i] = norm(b[i])^2, kernel wrapper.
void divEqVc(DeviceArray< cudaComplex > &a, DeviceArray< cudaReal > const &b, cudaReal const c)
Vector division in-place w/ coeff., a[i] /= (b[i] * c), kernel wrapper.
void mulEqVPair(DeviceArray< cudaReal > &a1, DeviceArray< cudaReal > &a2, DeviceArray< cudaReal > const &s)
In-place vector multiplication in pairs, ax[i] *= s[i], kernel wrapper.
void addEqVc(DeviceArray< cudaReal > &a, DeviceArray< cudaReal > const &b, cudaReal const c)
Vector addition in-place w/ coefficient, a[i] += b[i] * c, kernel wrapper.
PSCF package top-level namespace.