#include <pscf/cuda/ThreadArray.h>
#include <pscf/cuda/HostDArray.h>
#include <pscf/cuda/cudaErrorCheck.h>

// Vector addition with coefficients, a[i] = (b[i]*c) + (d[i]*e), GPU kernel.
__global__ void _addVcVc(cudaReal* a, cudaReal const * b, cudaReal const c,
                         cudaReal const * d, cudaReal const e, const int n)
{
   int nThreads = blockDim.x * gridDim.x;
   int startID = blockIdx.x * blockDim.x + threadIdx.x;
   for (int i = startID; i < n; i += nThreads) {
      a[i] = (b[i] * c) + (d[i] * e);
   }
}
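
// All kernels in this file use the same grid-stride loop: each thread starts at
// its global index (startID) and advances by the total number of launched
// threads (nThreads), so every element i < n is processed even when n exceeds
// the thread count. A serial equivalent of the kernel above, shown only for
// illustration, is:
//
//    for (int i = 0; i < n; ++i) {
//       a[i] = (b[i] * c) + (d[i] * e);
//    }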

// Three-vector addition with coefficients,
// a[i] = (b[i]*c) + (d[i]*e) + (f[i]*g), GPU kernel.
__global__ void _addVcVcVc(cudaReal* a, cudaReal const * b, cudaReal const c,
                           cudaReal const * d, cudaReal const e,
                           cudaReal const * f, cudaReal const g, const int n)
{
   int nThreads = blockDim.x * gridDim.x;
   int startID = blockIdx.x * blockDim.x + threadIdx.x;
   for (int i = startID; i < n; i += nThreads) {
      a[i] = (b[i] * c) + (d[i] * e) + (f[i] * g);
   }
}

// In-place vector addition with coefficient, a[i] += b[i]*c, GPU kernel.
__global__ void _addEqVc(cudaReal* a, cudaReal const * b, cudaReal const c,
                         const int n)
{
   int nThreads = blockDim.x * gridDim.x;
   int startID = blockIdx.x * blockDim.x + threadIdx.x;
   for (int i = startID; i < n; i += nThreads) {
      a[i] += b[i] * c;
   }
}

// Vector subtraction with scalar, a[i] = b[i] - c[i] - d, GPU kernel.
__global__ void _subVVS(cudaReal* a, cudaReal const * b,
                        cudaReal const * c, cudaReal const d, const int n)
{
   int nThreads = blockDim.x * gridDim.x;
   int startID = blockIdx.x * blockDim.x + threadIdx.x;
   for (int i = startID; i < n; i += nThreads) {
      a[i] = b[i] - c[i] - d;
   }
}

// In-place division of a complex array, a[i] /= (b[i]*c), GPU kernel.
__global__ void _divEqVc(cudaComplex* a, cudaReal const * b, cudaReal const c,
                         const int n)
{
   int nThreads = blockDim.x * gridDim.x;
   int startID = blockIdx.x * blockDim.x + threadIdx.x;
   for (int i = startID; i < n; i += nThreads) {
      a[i].x /= (b[i] * c);
      a[i].y /= (b[i] * c);
   }
}

// Vector exponentiation with coefficient, a[i] = exp(b[i]*c), GPU kernel.
__global__ void _expVc(cudaReal* a, cudaReal const * b, cudaReal const c,
                       const int n)
{
   int nThreads = blockDim.x * gridDim.x;
   int startID = blockIdx.x * blockDim.x + threadIdx.x;
   for (int i = startID; i < n; i += nThreads) {
      a[i] = exp(b[i] * c);
   }
}
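
// Note: cudaReal is typically a typedef for float or double, chosen by the
// build configuration; the unqualified exp call above resolves to the matching
// CUDA device overload.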

// Assignment of one source to a pair of outputs, a1[i] = a2[i] = s[i], GPU kernel.
__global__ void _eqVPair(cudaReal* a1, cudaReal* a2, cudaReal const * s,
                         const int n)
{
   int nThreads = blockDim.x * gridDim.x;
   int startID = blockIdx.x * blockDim.x + threadIdx.x;
   for (int i = startID; i < n; i += nThreads) {
      const cudaReal input = s[i];
      a1[i] = input;
      a2[i] = input;
   }
}

// Pairwise vector multiplication by a shared factor,
// a1[i] = b1[i]*s[i] and a2[i] = b2[i]*s[i], GPU kernel.
__global__ void _mulVVPair(cudaReal* a1, cudaReal* a2,
                           cudaReal const * b1, cudaReal const * b2,
                           cudaReal const * s, const int n)
{
   int nThreads = blockDim.x * gridDim.x;
   int startID = blockIdx.x * blockDim.x + threadIdx.x;
   for (int i = startID; i < n; i += nThreads) {
      const cudaReal input = s[i];
      a1[i] = b1[i] * input;
      a2[i] = b2[i] * input;
   }
}

// In-place pairwise multiplication by a shared factor,
// a1[i] *= s[i] and a2[i] *= s[i], GPU kernel.
__global__ void _mulEqVPair(cudaReal* a1, cudaReal* a2, cudaReal const * s,
                            const int n)
{
   int nThreads = blockDim.x * gridDim.x;
   int startID = blockIdx.x * blockDim.x + threadIdx.x;
   for (int i = startID; i < n; i += nThreads) {
      const cudaReal input = s[i];
      a1[i] *= input;
      a2[i] *= input;
   }
}
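
// The paired kernels above (_eqVPair, _mulVVPair, _mulEqVPair) write two output
// arrays in a single pass, so the shared factor s[i] is read from global memory
// only once per element.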

// Sum of an arbitrary number of vectors, a[i] = sum over j of vecs[j][i], GPU kernel.
__global__ void _addVMany(cudaReal* a, cudaReal const ** vecs,
                          const int nVecs, const int n)
{
   int nThreads = blockDim.x * gridDim.x;
   int startID = blockIdx.x * blockDim.x + threadIdx.x;
   for (int i = startID; i < n; i += nThreads) {
      cudaReal sum = vecs[0][i];
      for (int j = 1; j < nVecs; j++) {
         sum += vecs[j][i];
      }
      a[i] = sum;
   }
}
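
// Note: _addVMany and _mulVMany receive a device array of device pointers
// (cudaReal const **). The host-side wrappers later in this file gather the
// element pointers into a HostDArray, copy that pointer array to the device,
// and pass its address to the kernel.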

// Product of an arbitrary number of vectors, a[i] = product over j of vecs[j][i], GPU kernel.
__global__ void _mulVMany(cudaReal* a, cudaReal const ** vecs,
                          const int nVecs, const int n)
{
   int nThreads = blockDim.x * gridDim.x;
   int startID = blockIdx.x * blockDim.x + threadIdx.x;
   for (int i = startID; i < n; i += nThreads) {
      cudaReal prod = vecs[0][i];
      for (int j = 1; j < nVecs; j++) {
         prod *= vecs[j][i];
      }
      a[i] = prod;
   }
}

// Squared norm of a complex vector, a[i] = |b[i]|^2, GPU kernel.
__global__ void _sqNormV(cudaReal* a, cudaComplex const * b, const int n)
{
   int nThreads = blockDim.x * gridDim.x;
   int startID = blockIdx.x * blockDim.x + threadIdx.x;
   for (int i = startID; i < n; i += nThreads) {
      a[i] = (b[i].x * b[i].x) + (b[i].y * b[i].y);
   }
}

// Fourth power of the norm of a complex vector, a[i] = |b[i]|^4, GPU kernel.
__global__ void _sqSqNormV(cudaReal* a, cudaComplex const * b, const int n)
{
   int nThreads = blockDim.x * gridDim.x;
   int startID = blockIdx.x * blockDim.x + threadIdx.x;
   cudaReal tmp;
   for (int i = startID; i < n; i += nThreads) {
      tmp = (b[i].x * b[i].x) + (b[i].y * b[i].y);
      a[i] = tmp * tmp;
   }
}

// Kernel wrapper: a[i] = (b[i]*c) + (d[i]*e).
void addVcVc(DeviceArray<cudaReal>& a,
             DeviceArray<cudaReal> const & b, cudaReal const c,
             DeviceArray<cudaReal> const & d, cudaReal const e)
{
   int n = a.capacity();
   int nBlocks, nThreads;
   ThreadArray::setThreadsLogical(n, nBlocks, nThreads); // assumed overload setting nBlocks & nThreads
   _addVcVc<<<nBlocks, nThreads>>>(a.cArray(), b.cArray(), c, d.cArray(), e, n);
   cudaErrorCheck( cudaGetLastError() );
}
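
// Usage sketch (illustrative only; array names are hypothetical and the arrays
// are assumed to be allocated with equal capacity and filled elsewhere):
//
//    DeviceArray<cudaReal> w, a, b;
//    // ... allocate and fill a and b ...
//    addVcVc(w, a, 0.5, b, -0.5);   // w[i] = 0.5*a[i] - 0.5*b[i]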

// Kernel wrapper: a[i] = (b[i]*c) + (d[i]*e) + (f[i]*g).
void addVcVcVc(DeviceArray<cudaReal>& a,
               DeviceArray<cudaReal> const & b, cudaReal const c,
               DeviceArray<cudaReal> const & d, cudaReal const e,
               DeviceArray<cudaReal> const & f, cudaReal const g)
{
   int n = a.capacity();
   int nBlocks, nThreads;
   ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
   _addVcVcVc<<<nBlocks, nThreads>>>(a.cArray(), b.cArray(), c, d.cArray(), e,
                                     f.cArray(), g, n);
   cudaErrorCheck( cudaGetLastError() );
}

// Kernel wrapper: a[i] += b[i]*c.
void addEqVc(DeviceArray<cudaReal>& a,
             DeviceArray<cudaReal> const & b, cudaReal const c)
{
   int n = a.capacity();
   int nBlocks, nThreads;
   ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
   _addEqVc<<<nBlocks, nThreads>>>(a.cArray(), b.cArray(), c, n);
   cudaErrorCheck( cudaGetLastError() );
}

// Kernel wrapper: a[i] = b[i] - c[i] - d.
void subVVS(DeviceArray<cudaReal>& a, DeviceArray<cudaReal> const & b,
            DeviceArray<cudaReal> const & c, cudaReal const d)
{
   int n = a.capacity();
   int nBlocks, nThreads;
   ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
   _subVVS<<<nBlocks, nThreads>>>(a.cArray(), b.cArray(), c.cArray(), d, n);
   cudaErrorCheck( cudaGetLastError() );
}

// Kernel wrapper: a[i] /= (b[i]*c), with complex a and real b.
void divEqVc(DeviceArray<cudaComplex>& a,
             DeviceArray<cudaReal> const & b, cudaReal const c)
{
   int n = a.capacity();
   int nBlocks, nThreads;
   ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
   _divEqVc<<<nBlocks, nThreads>>>(a.cArray(), b.cArray(), c, n);
   cudaErrorCheck( cudaGetLastError() );
}

// Kernel wrapper: a[i] = exp(b[i]*c).
void expVc(DeviceArray<cudaReal>& a,
           DeviceArray<cudaReal> const & b, cudaReal const c)
{
   int n = a.capacity();
   int nBlocks, nThreads;
   ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
   _expVc<<<nBlocks, nThreads>>>(a.cArray(), b.cArray(), c, n);
   cudaErrorCheck( cudaGetLastError() );
}

// Kernel wrapper: a1[i] = a2[i] = s[i].
void eqVPair(DeviceArray<cudaReal>& a1, DeviceArray<cudaReal>& a2,
             DeviceArray<cudaReal> const & s)
{
   int n = a1.capacity();
   int nBlocks, nThreads;
   ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
   _eqVPair<<<nBlocks, nThreads>>>(a1.cArray(), a2.cArray(), s.cArray(), n);
   cudaErrorCheck( cudaGetLastError() );
}

// Kernel wrapper: a1[i] = b1[i]*s[i] and a2[i] = b2[i]*s[i].
void mulVVPair(DeviceArray<cudaReal>& a1, DeviceArray<cudaReal>& a2,
               DeviceArray<cudaReal> const & b1,
               DeviceArray<cudaReal> const & b2,
               DeviceArray<cudaReal> const & s)
{
   int n = a1.capacity();
   int nBlocks, nThreads;
   ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
   _mulVVPair<<<nBlocks, nThreads>>>(a1.cArray(), a2.cArray(), b1.cArray(),
                                     b2.cArray(), s.cArray(), n);
   cudaErrorCheck( cudaGetLastError() );
}

// Kernel wrapper: a1[i] *= s[i] and a2[i] *= s[i].
void mulEqVPair(DeviceArray<cudaReal>& a1, DeviceArray<cudaReal>& a2,
                DeviceArray<cudaReal> const & s)
{
   int n = a1.capacity();
   int nBlocks, nThreads;
   ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
   _mulEqVPair<<<nBlocks, nThreads>>>(a1.cArray(), a2.cArray(), s.cArray(), n);
   cudaErrorCheck( cudaGetLastError() );
}

// Kernel wrapper: a[i] = vecs[0][i] + vecs[1][i] + ... for any number of vectors.
void addVMany(DeviceArray<cudaReal>& a,
              DArray<DeviceArray<cudaReal> > const & vecs)
{
   int nVecs = vecs.capacity();
   int n = vecs[0].capacity();
   if (nVecs == 2) {
      addVV(a, vecs[0], vecs[1]);
      return;
   }

   // Collect device pointers on the host, then copy the pointer array to the device
   HostDArray<cudaReal const *> vecs_h(nVecs);
   for (int i = 0; i < nVecs; i++) {
      vecs_h[i] = vecs[i].cArray();
   }
   DeviceArray<cudaReal const *> vecs_d(nVecs);
   vecs_d = vecs_h;

   int nBlocks, nThreads;
   ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
   _addVMany<<<nBlocks, nThreads>>>(a.cArray(), vecs_d.cArray(), nVecs, n);
   cudaErrorCheck( cudaGetLastError() );
}

// Kernel wrapper: overload taking an array of pointers to the input vectors.
void addVMany(DeviceArray<cudaReal>& a,
              DArray<DeviceArray<cudaReal> const *> const & vecs)
{
   int nVecs = vecs.capacity();
   int n = vecs[0]->capacity();
   if (nVecs == 2) {
      addVV(a, *vecs[0], *vecs[1]);
      return;
   }

   HostDArray<cudaReal const *> vecs_h(nVecs);
   for (int i = 0; i < nVecs; i++) {
      vecs_h[i] = vecs[i]->cArray();
   }
   DeviceArray<cudaReal const *> vecs_d(nVecs);
   vecs_d = vecs_h;

   int nBlocks, nThreads;
   ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
   _addVMany<<<nBlocks, nThreads>>>(a.cArray(), vecs_d.cArray(), nVecs, n);
   cudaErrorCheck( cudaGetLastError() );
}
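
// Usage sketch (illustrative only; names are hypothetical and allocation is
// assumed to happen elsewhere):
//
//    DArray<DeviceArray<cudaReal> > terms;
//    terms.allocate(3);
//    // ... allocate terms[0..2] and an output array 'total' of equal capacity,
//    //     then fill the terms ...
//    addVMany(total, terms);   // total[i] = terms[0][i] + terms[1][i] + terms[2][i]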

// Kernel wrapper: a[i] = vecs[0][i] * vecs[1][i] * ... for any number of vectors.
void mulVMany(DeviceArray<cudaReal>& a,
              DArray<DeviceArray<cudaReal> > const & vecs)
{
   int nVecs = vecs.capacity();
   int n = vecs[0].capacity();
   if (nVecs == 2) {
      mulVV(a, vecs[0], vecs[1]);
      return;
   }

   HostDArray<cudaReal const *> vecs_h(nVecs);
   for (int i = 0; i < nVecs; i++) {
      vecs_h[i] = vecs[i].cArray();
   }
   DeviceArray<cudaReal const *> vecs_d(nVecs);
   vecs_d = vecs_h;

   int nBlocks, nThreads;
   ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
   _mulVMany<<<nBlocks, nThreads>>>(a.cArray(), vecs_d.cArray(), nVecs, n);
   cudaErrorCheck( cudaGetLastError() );
}

// Kernel wrapper: overload taking an array of pointers to the input vectors.
void mulVMany(DeviceArray<cudaReal>& a,
              DArray<DeviceArray<cudaReal> const *> const & vecs)
{
   int nVecs = vecs.capacity();
   int n = vecs[0]->capacity();
   if (nVecs == 2) {
      mulVV(a, *vecs[0], *vecs[1]);
      return;
   }

   HostDArray<cudaReal const *> vecs_h(nVecs);
   for (int i = 0; i < nVecs; i++) {
      vecs_h[i] = vecs[i]->cArray();
   }
   DeviceArray<cudaReal const *> vecs_d(nVecs);
   vecs_d = vecs_h;

   int nBlocks, nThreads;
   ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
   _mulVMany<<<nBlocks, nThreads>>>(a.cArray(), vecs_d.cArray(), nVecs, n);
   cudaErrorCheck( cudaGetLastError() );
}

// Kernel wrapper: a[i] = |b[i]|^2 for complex b.
void sqNormV(DeviceArray<cudaReal>& a, DeviceArray<cudaComplex> const & b)
{
   int n = a.capacity();
   int nBlocks, nThreads;
   ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
   _sqNormV<<<nBlocks, nThreads>>>(a.cArray(), b.cArray(), n);
   cudaErrorCheck( cudaGetLastError() );
}

// Kernel wrapper: a[i] = |b[i]|^4 for complex b.
void sqSqNormV(DeviceArray<cudaReal>& a, DeviceArray<cudaComplex> const & b)
{
   int n = a.capacity();
   int nBlocks, nThreads;
   ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
   _sqSqNormV<<<nBlocks, nThreads>>>(a.cArray(), b.cArray(), n);
   cudaErrorCheck( cudaGetLastError() );
}
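
// Usage sketch (illustrative only; kField is a hypothetical Fourier-space
// array, assumed to be allocated and filled elsewhere):
//
//    DeviceArray<cudaComplex> kField;
//    DeviceArray<cudaReal> intensity;
//    // ... allocate both with equal capacity and fill kField ...
//    sqNormV(intensity, kField);   // intensity[i] = |kField[i]|^2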