PSCF v1.2
VecOpMisc.cu
/*
* PSCF Package
*
* Copyright 2016 - 2022, The Regents of the University of Minnesota
* Distributed under the terms of the GNU General Public License.
*/
7
8#include "VecOpMisc.h"
9#include "VecOp.h"
10#include <pscf/cuda/ThreadArray.h>
11#include <pscf/cuda/HostDArray.h>
12#include <pscf/cuda/cudaErrorCheck.h>
13#include <cmath>
14
15namespace Pscf {
16namespace Prdc {
17namespace Cuda {
18namespace VecOp {
19
20// CUDA kernels:
21// (defined in anonymous namespace, used within this file only)
22
23namespace {
24
25 /*
26 * Vector addition w/ coefficient, a[i] = (b[i]*c) + (d[i]*e), GPU kernel.
27 *
28 * \param a output array (LHS)
29 * \param b input array 1 (RHS)
30 * \param c input scalar 1 (RHS)
31 * \param d input array 2 (RHS)
32 * \param e input scalar 2 (RHS)
33 * \param n size of arrays
34 */
35 __global__ void _addVcVc(cudaReal* a, cudaReal const * b, cudaReal const c,
36 cudaReal const * d, cudaReal const e, const int n)
37 {
38 int nThreads = blockDim.x * gridDim.x;
39 int startID = blockIdx.x * blockDim.x + threadIdx.x;
40 for(int i = startID; i < n; i += nThreads) {
41 a[i] = (b[i] * c) + (d[i] * e);
42 }
43 }
44
45 /*
46 * 3-vector add. w/ coeff, a[i] = (b[i]*c) + (d[i]*e) + (f[i]*g), GPU kernel.
47 *
48 * \param a output array (LHS)
49 * \param b input array 1 (RHS)
50 * \param c input scalar 1 (RHS)
51 * \param d input array 2 (RHS)
52 * \param e input scalar 2 (RHS)
53 * \param f input array 3 (RHS)
54 * \param g input scalar 3 (RHS)
55 * \param n size of arrays
56 */
57 __global__ void _addVcVcVc(cudaReal* a, cudaReal const * b, cudaReal const c,
58 cudaReal const * d, cudaReal const e,
59 cudaReal const * f, cudaReal const g, const int n)
60 {
61 int nThreads = blockDim.x * gridDim.x;
62 int startID = blockIdx.x * blockDim.x + threadIdx.x;
63 for(int i = startID; i < n; i += nThreads) {
64 a[i] = (b[i] * c) + (d[i] * e) + (f[i] * g);
65 }
66 }
67
68 /*
69 * Vector addition in-place w/ coefficient, a[i] += b[i] * c, GPU kernel.
70 *
71 * \param a output array (LHS)
72 * \param b input array (RHS)
73 * \param c input scalar
74 * \param n size of arrays
75 */
76 __global__ void _addEqVc(cudaReal* a, cudaReal const * b,
77 cudaReal const c, const int n)
78 {
79 int nThreads = blockDim.x * gridDim.x;
80 int startID = blockIdx.x * blockDim.x + threadIdx.x;
81 for(int i = startID; i < n; i += nThreads) {
82 a[i] += b[i] * c;
83 }
84 }
85
86 /*
87 * Vector subtraction, a[i] = b[i] - c[i] - d, GPU kernel.
88 *
89 * \param a output array (LHS)
90 * \param b input array (RHS)
91 * \param c input array (RHS)
92 * \param d input scalar (RHS)
93 * \param n size of arrays
94 */
95 __global__ void _subVVS(cudaReal* a, cudaReal const * b,
96 cudaReal const * c, cudaReal const d, const int n)
97 {
98 int nThreads = blockDim.x * gridDim.x;
99 int startID = blockIdx.x * blockDim.x + threadIdx.x;
100 for(int i = startID; i < n; i += nThreads) {
101 a[i] = b[i] - c[i] - d;
102 }
103 }
104
105 /*
106 * Vector division in-place w/ coeff., a[i] /= (b[i] * c), GPU kernel.
107 *
108 * \param a output array (LHS)
109 * \param b input array (RHS)
110 * \param c input scalar (RHS)
111 * \param n size of arrays
112 */
113 __global__ void _divEqVc(cudaComplex* a, cudaReal const * b,
114 cudaReal const c, const int n)
115 {
116 int nThreads = blockDim.x * gridDim.x;
117 int startID = blockIdx.x * blockDim.x + threadIdx.x;
118 for(int i = startID; i < n; i += nThreads) {
119 a[i].x /= (b[i] * c);
120 a[i].y /= (b[i] * c);
121 }
122 }
123
124 /*
125 * Vector exponentiation w/ coefficient, a[i] = exp(b[i]*c), GPU kernel.
126 *
127 * \param a output array (LHS)
128 * \param b input array (RHS)
129 * \param c input scalar
130 * \param n size of arrays
131 */
132 __global__ void _expVc(cudaReal* a, cudaReal const * b,
133 cudaReal const c, const int n)
134 {
135 int nThreads = blockDim.x * gridDim.x;
136 int startID = blockIdx.x * blockDim.x + threadIdx.x;
137 for(int i = startID; i < n; i += nThreads) {
138 a[i] = exp(b[i] * c);
139 }
140 }
141
142 /*
143 * Vector assignment in pairs, a1[i] = b[i] and a2[i] = b[i], CUDA kernel.
144 *
145 * \param a1 output array 1 (LHS)
146 * \param a2 output array 2 (LHS)
147 * \param s shared input array to be assigned to both a1 and a2
148 * \param n size of arrays
149 */
150 __global__ void _eqVPair(cudaReal* a1, cudaReal* a2,
151 cudaReal const * s, const int n)
152 {
153 int nThreads = blockDim.x * gridDim.x;
154 int startID = blockIdx.x * blockDim.x + threadIdx.x;
155 cudaReal input;
156 for (int i = startID; i < n; i += nThreads) {
157 input = s[i];
158 a1[i] = input;
159 a2[i] = input;
160 }
161 }
162
163 /*
164 * Vector multiplication in pairs, ax[i] = bx[i] * s[i], CUDA kernel.
165 *
166 * \param a1 output array 1 (LHS)
167 * \param a2 output array 2 (LHS)
168 * \param b1 input array 1 (RHS)
169 * \param b2 input array 2 (RHS)
170 * \param s shared input array to be multiplied by both a1 and a2
171 * \param n size of arrays
172 */
173 __global__ void _mulVVPair(cudaReal* a1, cudaReal * a2,
174 cudaReal const * b1, cudaReal const * b2,
175 cudaReal const * s, const int n)
176 {
177 int nThreads = blockDim.x * gridDim.x;
178 int startID = blockIdx.x * blockDim.x + threadIdx.x;
179 cudaReal input;
180 for (int i = startID; i < n; i += nThreads) {
181 input = s[i];
182 a1[i] = b1[i] * input;
183 a2[i] = b2[i] * input;
184 }
185 }
186
187 /*
188 * In-place vector multiplication in pairs, ax[i] *= s[i], CUDA kernel
189 *
190 * \param a1 output array 1 (LHS)
191 * \param a2 output array 2 (LHS)
192 * \param s shared input array to be multiplied by both a1 and a2
193 * \param n size of arrays
194 */
195 __global__ void _mulEqVPair(cudaReal* a1, cudaReal* a2,
196 cudaReal const * s, const int n)
197 {
198 int nThreads = blockDim.x * gridDim.x;
199 int startID = blockIdx.x * blockDim.x + threadIdx.x;
200 cudaReal input;
201 for (int i = startID; i < n; i += nThreads) {
202 input = s[i];
203 a1[i] *= input;
204 a2[i] *= input;
205 }
206 }
207
208 /*
209 * Add an undefined number of vectors pointwise, GPU kernel.
210 *
211 * The input const pointer 'vecs' points to an array of const pointers. In
212 * other words, this is an array of arrays, where each array is represented
213 * by its pointer. The size of vecs is nVecs, and the size of vecs[i] is
214 * n (if i < nVecs). These nVecs vectors will be added and the result will
215 * be stored in vector 'a'.
216 *
217 * \param a output array (LHS)
218 * \param vecs array of pointers to DeviceArrays to be added
219 * \param nVecs number of vectors to be added
220 * \param n size of arrays
221 */
222 __global__ void _addVMany(cudaReal* a, cudaReal const ** vecs,
223 const int nVecs, const int n)
224 {
225 int nThreads = blockDim.x * gridDim.x;
226 int startID = blockIdx.x * blockDim.x + threadIdx.x;
227 for (int i = startID; i < n; i += nThreads) {
228 cudaReal sum = vecs[0][i];
229 for (int j = 1; j < nVecs; j++) {
230 sum += vecs[j][i];
231 }
232 a[i] = sum;
233 }
234 }
235
236 /*
237 * Multiply an undefined number of vectors pointwise, GPU kernel.
238 *
239 * The input const pointer 'vecs' points to an array of const pointers. In
240 * other words, this is an array of arrays, where each array is represented
241 * by its pointer. The size of vecs is nVecs, and the size of vecs[i] is
242 * n (if i < nVecs). These nVecs vectors will be multiplied and the result
243 * will be stored in vector 'a'.
244 *
245 * \param a output array (LHS)
246 * \param vecs array of pointers to DeviceArrays to be multiplied
247 * \param nVecs number of vectors to be multiplied
248 * \param n size of arrays
249 */
250 __global__ void _mulVMany(cudaReal* a, cudaReal const ** vecs,
251 const int nVecs, const int n)
252 {
253 int nThreads = blockDim.x * gridDim.x;
254 int startID = blockIdx.x * blockDim.x + threadIdx.x;
255 for (int i = startID; i < n; i += nThreads) {
256 cudaReal prod = vecs[0][i];
257 for (int j = 1; j < nVecs; j++) {
258 prod *= vecs[j][i];
259 }
260 a[i] = prod;
261 }
262 }
263
264 /*
265 * Squared norm of complex number, a[i] = norm(b[i])^2, GPU kernel.
266 *
267 * \param a output array (LHS)
268 * \param b input array (RHS)
269 * \param n size of arrays
270 */
271 __global__ void _sqNormV(cudaReal* a, cudaComplex const * b, const int n)
272 {
273 int nThreads = blockDim.x * gridDim.x;
274 int startID = blockIdx.x * blockDim.x + threadIdx.x;
275 for(int i = startID; i < n; i += nThreads) {
276 a[i] = (b[i].x * b[i].x) + (b[i].y * b[i].y);
277 }
278 }
279
280 /*
281 * Norm of complex number to the 4th power, a[i] = norm(b[i])^4, GPU kernel.
282 *
283 * \param a output array (LHS)
284 * \param b input array (RHS)
285 * \param n size of arrays
286 */
287 __global__ void _sqSqNormV(cudaReal* a, cudaComplex const * b, const int n)
288 {
289 int nThreads = blockDim.x * gridDim.x;
290 int startID = blockIdx.x * blockDim.x + threadIdx.x;
291 cudaReal tmp;
292 for(int i = startID; i < n; i += nThreads) {
293 tmp = (b[i].x * b[i].x) + (b[i].y * b[i].y);
294 a[i] = tmp * tmp;
295 }
296 }
297
298} // End anonymous namespace
299
300
301// CUDA kernel wrappers:
302
// Vector addition w/ coefficient, a[i] = (b[i]*c) + (d[i]*e), kernel wrapper.
void addVcVc(DeviceArray<cudaReal> & a,
             DeviceArray<cudaReal> const & b, cudaReal const c,
             DeviceArray<cudaReal> const & d, cudaReal const e)
{
   const int n = a.capacity();
   UTIL_CHECK(b.capacity() >= n);
   UTIL_CHECK(d.capacity() >= n);

   // GPU resources
   int nBlocks, nThreads;
   ThreadArray::setThreadsLogical(n, nBlocks, nThreads);

   // Launch kernel
   _addVcVc<<<nBlocks, nThreads>>>(a.cArray(), b.cArray(), c, d.cArray(), e, n);
   cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
}
320
// 3-vector add. w/ coeff, a[i] = (b[i]*c) + (d[i]*e) + (f[i]*g), kernel wrapper
void addVcVcVc(DeviceArray<cudaReal> & a,
               DeviceArray<cudaReal> const & b, cudaReal const c,
               DeviceArray<cudaReal> const & d, cudaReal const e,
               DeviceArray<cudaReal> const & f, cudaReal const g)
{
   const int n = a.capacity();
   UTIL_CHECK(b.capacity() >= n);
   UTIL_CHECK(d.capacity() >= n);
   UTIL_CHECK(f.capacity() >= n);

   // GPU resources
   int nBlocks, nThreads;
   ThreadArray::setThreadsLogical(n, nBlocks, nThreads);

   // Launch kernel
   _addVcVcVc<<<nBlocks, nThreads>>>(a.cArray(), b.cArray(), c, d.cArray(), e,
                                     f.cArray(), g, n);
   cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
}
341
// Vector addition in-place w/ coefficient, a[i] += b[i] * c, kernel wrapper.
void addEqVc(DeviceArray<cudaReal> & a, DeviceArray<cudaReal> const & b,
             cudaReal const c)
{
   const int n = a.capacity();
   UTIL_CHECK(b.capacity() >= n);

   // GPU resources
   int nBlocks, nThreads;
   ThreadArray::setThreadsLogical(n, nBlocks, nThreads);

   // Launch kernel
   _addEqVc<<<nBlocks, nThreads>>>(a.cArray(), b.cArray(), c, n);
   cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
}
357
// Vector subtraction, a[i] = b[i] - c[i] - d, kernel wrapper.
void subVVS(DeviceArray<cudaReal> & a, DeviceArray<cudaReal> const & b,
            DeviceArray<cudaReal> const & c, cudaReal const d)
{
   const int n = a.capacity();
   UTIL_CHECK(b.capacity() >= n);
   UTIL_CHECK(c.capacity() >= n);

   // GPU resources
   int nBlocks, nThreads;
   ThreadArray::setThreadsLogical(n, nBlocks, nThreads);

   // Launch kernel
   _subVVS<<<nBlocks, nThreads>>>(a.cArray(), b.cArray(),
                                  c.cArray(), d, n);
   cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
}
375
// Vector division in-place w/ coeff., a[i] /= (b[i] * c), kernel wrapper.
void divEqVc(DeviceArray<cudaComplex> & a, DeviceArray<cudaReal> const & b,
             cudaReal const c)
{
   const int n = a.capacity();
   UTIL_CHECK(b.capacity() >= n);

   // GPU resources
   int nBlocks, nThreads;
   ThreadArray::setThreadsLogical(n, nBlocks, nThreads);

   // Launch kernel
   _divEqVc<<<nBlocks, nThreads>>>(a.cArray(), b.cArray(), c, n);
   cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
}
391
// Vector exponentiation w/ coefficient, a[i] = exp(b[i]*c), kernel wrapper.
void expVc(DeviceArray<cudaReal> & a, DeviceArray<cudaReal> const & b,
           cudaReal const c)
{
   const int n = a.capacity();
   UTIL_CHECK(b.capacity() >= n);

   // GPU resources
   int nBlocks, nThreads;
   ThreadArray::setThreadsLogical(n, nBlocks, nThreads);

   // Launch kernel
   _expVc<<<nBlocks, nThreads>>>(a.cArray(), b.cArray(), c, n);
   cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
}
407
// Vector assignment in pairs, ax[i] = s[i], kernel wrapper.
void eqVPair(DeviceArray<cudaReal> & a1, DeviceArray<cudaReal> & a2,
             DeviceArray<cudaReal> const & s)
{
   const int n = a1.capacity();
   UTIL_CHECK(a2.capacity() == n);
   UTIL_CHECK(s.capacity() >= n);

   // GPU resources
   int nBlocks, nThreads;
   ThreadArray::setThreadsLogical(n, nBlocks, nThreads);

   // Launch kernel
   _eqVPair<<<nBlocks, nThreads>>>(a1.cArray(), a2.cArray(), s.cArray(), n);
   cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
}
424
// Vec. mul. in pairs, ax[i] = bx[i] * s[i], kernel wrapper.
void mulVVPair(DeviceArray<cudaReal> & a1, DeviceArray<cudaReal> & a2,
               DeviceArray<cudaReal> const & b1,
               DeviceArray<cudaReal> const & b2,
               DeviceArray<cudaReal> const & s)
{
   const int n = a1.capacity();
   UTIL_CHECK(a2.capacity() == n);
   UTIL_CHECK(b1.capacity() >= n);
   UTIL_CHECK(b2.capacity() >= n);
   UTIL_CHECK(s.capacity() >= n);

   // GPU resources
   int nBlocks, nThreads;
   ThreadArray::setThreadsLogical(n, nBlocks, nThreads);

   // Launch kernel
   _mulVVPair<<<nBlocks, nThreads>>>(a1.cArray(), a2.cArray(), b1.cArray(),
                                     b2.cArray(), s.cArray(), n);
   cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
}
446
// In-place vec. mul. in pairs, ax[i] *= s[i], kernel wrapper
void mulEqVPair(DeviceArray<cudaReal> & a1, DeviceArray<cudaReal> & a2,
                DeviceArray<cudaReal> const & s)
{
   const int n = a1.capacity();
   UTIL_CHECK(a2.capacity() == n);
   UTIL_CHECK(s.capacity() >= n);

   // GPU resources
   int nBlocks, nThreads;
   ThreadArray::setThreadsLogical(n, nBlocks, nThreads);

   // Launch kernel
   _mulEqVPair<<<nBlocks, nThreads>>>(a1.cArray(), a2.cArray(), s.cArray(), n);
   cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
}
463
// Add an undefined number of vectors pointwise, kernel wrapper.
void addVMany(DeviceArray<cudaReal> & a,
              DArray<DeviceArray<cudaReal> > const & vecs)
{
   int nVecs = vecs.capacity();
   UTIL_CHECK(nVecs > 1);
   int n = vecs[0].capacity();
   UTIL_CHECK(a.capacity() >= n);

   if (nVecs == 2) {
      addVV(a, vecs[0], vecs[1]);
      return;
   }

   // Create array of pointers to arrays on host
   HostDArray<cudaReal const *> vecs_h(nVecs);
   for (int i = 0; i < nVecs; i++) {
      UTIL_CHECK(vecs[i].capacity() >= n); // every input must span n elements
      vecs_h[i] = vecs[i].cArray();
   }
   DeviceArray<cudaReal const *> vecs_d(nVecs);
   vecs_d = vecs_h; // transfer array of pointers to device

   // GPU resources
   int nBlocks, nThreads;
   ThreadArray::setThreadsLogical(n, nBlocks, nThreads);

   // Launch kernel
   _addVMany<<<nBlocks, nThreads>>>(a.cArray(), vecs_d.cArray(), nVecs, n);
   cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
}
493
// Add an undefined number of vectors pointwise, kernel wrapper.
void addVMany(DeviceArray<cudaReal> & a,
              DArray<DeviceArray<cudaReal> const *> const & vecs)
{
   int nVecs = vecs.capacity();
   UTIL_CHECK(nVecs > 1);
   int n = vecs[0]->capacity();
   UTIL_CHECK(a.capacity() >= n);

   if (nVecs == 2) {
      addVV(a, *vecs[0], *vecs[1]);
      return;
   }

   // Create array of pointers to arrays on host
   HostDArray<cudaReal const *> vecs_h(nVecs);
   for (int i = 0; i < nVecs; i++) {
      UTIL_CHECK(vecs[i]->capacity() >= n); // every input must span n elements
      vecs_h[i] = vecs[i]->cArray();
   }
   DeviceArray<cudaReal const *> vecs_d(nVecs);
   vecs_d = vecs_h; // transfer array of pointers to device

   // GPU resources
   int nBlocks, nThreads;
   ThreadArray::setThreadsLogical(n, nBlocks, nThreads);

   // Launch kernel
   _addVMany<<<nBlocks, nThreads>>>(a.cArray(), vecs_d.cArray(), nVecs, n);
   cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
}
523
// Multiply an undefined number of vectors pointwise, kernel wrapper.
void mulVMany(DeviceArray<cudaReal> & a,
              DArray<DeviceArray<cudaReal> > const & vecs)
{
   int nVecs = vecs.capacity();
   UTIL_CHECK(nVecs > 1);
   int n = vecs[0].capacity();
   UTIL_CHECK(a.capacity() >= n);

   if (nVecs == 2) {
      mulVV(a, vecs[0], vecs[1]);
      return;
   }

   // Create array of pointers to arrays on host
   HostDArray<cudaReal const *> vecs_h(nVecs);
   for (int i = 0; i < nVecs; i++) {
      UTIL_CHECK(vecs[i].capacity() >= n); // every input must span n elements
      vecs_h[i] = vecs[i].cArray();
   }
   DeviceArray<cudaReal const *> vecs_d(nVecs);
   vecs_d = vecs_h; // transfer array of pointers to device

   // GPU resources
   int nBlocks, nThreads;
   ThreadArray::setThreadsLogical(n, nBlocks, nThreads);

   // Launch kernel
   _mulVMany<<<nBlocks, nThreads>>>(a.cArray(), vecs_d.cArray(), nVecs, n);
   cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
}
553
// Multiply an undefined number of vectors pointwise, kernel wrapper.
void mulVMany(DeviceArray<cudaReal> & a,
              DArray<DeviceArray<cudaReal> const *> const & vecs)
{
   int nVecs = vecs.capacity();
   UTIL_CHECK(nVecs > 1);
   int n = vecs[0]->capacity();
   UTIL_CHECK(a.capacity() >= n);

   if (nVecs == 2) {
      mulVV(a, *vecs[0], *vecs[1]);
      return;
   }

   // Create array of pointers to arrays on host
   HostDArray<cudaReal const *> vecs_h(nVecs);
   for (int i = 0; i < nVecs; i++) {
      UTIL_CHECK(vecs[i]->capacity() >= n); // every input must span n elements
      vecs_h[i] = vecs[i]->cArray();
   }
   DeviceArray<cudaReal const *> vecs_d(nVecs);
   vecs_d = vecs_h; // transfer array of pointers to device

   // GPU resources
   int nBlocks, nThreads;
   ThreadArray::setThreadsLogical(n, nBlocks, nThreads);

   // Launch kernel
   _mulVMany<<<nBlocks, nThreads>>>(a.cArray(), vecs_d.cArray(), nVecs, n);
   cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
}
583
// Squared norm of complex vector, a[i] = norm(b[i])^2, kernel wrapper.
void sqNormV(DeviceArray<cudaReal> & a, DeviceArray<cudaComplex> const & b)
{
   const int n = a.capacity();
   UTIL_CHECK(b.capacity() >= n);

   // GPU resources
   int nBlocks, nThreads;
   ThreadArray::setThreadsLogical(n, nBlocks, nThreads);

   // Launch kernel
   _sqNormV<<<nBlocks, nThreads>>>(a.cArray(), b.cArray(), n);
   cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
}
598
// Norm of complex number to the 4th power, a[i] = norm(b[i])^4, kernel wrapper.
void sqSqNormV(DeviceArray<cudaReal> & a, DeviceArray<cudaComplex> const & b)
{
   const int n = a.capacity();
   UTIL_CHECK(b.capacity() >= n);

   // GPU resources
   int nBlocks, nThreads;
   ThreadArray::setThreadsLogical(n, nBlocks, nThreads);

   // Launch kernel
   _sqSqNormV<<<nBlocks, nThreads>>>(a.cArray(), b.cArray(), n);
   cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
}
613
614} // namespace VecOp
615} // namespace Cuda
616} // namespace Prdc
617} // namespace Pscf
Dynamic array on the GPU device with aligned data.
Definition rpg/System.h:32
int capacity() const
Return allocated capacity.
Data * cArray()
Return pointer to underlying C array.
Template for dynamic array stored in host CPU memory.
Definition HostDArray.h:43
Data * cArray()
Return a pointer to the underlying C array.
Definition Array.h:214
Dynamically allocatable contiguous array template.
#define UTIL_CHECK(condition)
Assertion macro suitable for serial or parallel production code.
Definition global.h:68
void setThreadsLogical(int nThreadsLogical)
Given total number of threads, set 1D execution configuration.
int nThreads()
Get the number of threads per block for execution.
cudaReal sum(DeviceArray< cudaReal > const &in)
Compute sum of array elements (GPU kernel wrapper).
Definition Reduce.cu:480
void addVMany(DeviceArray< cudaReal > &a, DArray< DeviceArray< cudaReal > > const &vecs)
Add an undefined number of vectors pointwise, kernel wrapper.
Definition VecOpMisc.cu:465
void addVV(DeviceArray< cudaReal > &a, DeviceArray< cudaReal > const &b, DeviceArray< cudaReal > const &c, const int beginIdA, const int beginIdB, const int beginIdC, const int n)
Vector addition, a[i] = b[i] + c[i], kernel wrapper (cudaReal).
Definition VecOp.cu:1084
void addVcVcVc(DeviceArray< cudaReal > &a, DeviceArray< cudaReal > const &b, cudaReal const c, DeviceArray< cudaReal > const &d, cudaReal const e, DeviceArray< cudaReal > const &f, cudaReal const g)
3-vec addition w coeff, a[i] = (b[i]*c) + (d[i]*e) + (f[i]*g), kernel wrapper.
Definition VecOpMisc.cu:322
void mulVV(DeviceArray< cudaReal > &a, DeviceArray< cudaReal > const &b, DeviceArray< cudaReal > const &c, const int beginIdA, const int beginIdB, const int beginIdC, const int n)
Vector multiplication, a[i] = b[i] * c[i], kernel wrapper (cudaReal).
Definition VecOp.cu:1376
void expVc(DeviceArray< cudaReal > &a, DeviceArray< cudaReal > const &b, cudaReal const c)
Vector exponentiation w/ coefficient, a[i] = exp(b[i]*c), kernel wrapper.
Definition VecOpMisc.cu:393
void sqSqNormV(DeviceArray< cudaReal > &a, DeviceArray< cudaComplex > const &b)
Norm of complex number to the 4th power, a[i] = norm(b[i])^4, kernel wrapper.
Definition VecOpMisc.cu:600
void mulVVPair(DeviceArray< cudaReal > &a1, DeviceArray< cudaReal > &a2, DeviceArray< cudaReal > const &b1, DeviceArray< cudaReal > const &b2, DeviceArray< cudaReal > const &s)
Vector multiplication in pairs, ax[i] = bx[i] * s[i], kernel wrapper.
Definition VecOpMisc.cu:426
void addVcVc(DeviceArray< cudaReal > &a, DeviceArray< cudaReal > const &b, cudaReal const c, DeviceArray< cudaReal > const &d, cudaReal const e)
Vector addition w/ coefficient, a[i] = (b[i]*c) + (d[i]*e), kernel wrapper.
Definition VecOpMisc.cu:304
void mulVMany(DeviceArray< cudaReal > &a, DArray< DeviceArray< cudaReal > > const &vecs)
Multiply an undefined number of vectors pointwise, kernel wrapper.
Definition VecOpMisc.cu:525
void subVVS(DeviceArray< cudaReal > &a, DeviceArray< cudaReal > const &b, DeviceArray< cudaReal > const &c, cudaReal const d)
Vector subtraction, a[i] = b[i] - c[i] - d, kernel wrapper.
Definition VecOpMisc.cu:359
void eqVPair(DeviceArray< cudaReal > &a1, DeviceArray< cudaReal > &a2, DeviceArray< cudaReal > const &s)
Vector assignment in pairs, ax[i] = b[i], kernel wrapper.
Definition VecOpMisc.cu:409
void sqNormV(DeviceArray< cudaReal > &a, DeviceArray< cudaComplex > const &b)
Squared norm of complex number, a[i] = norm(b[i])^2, kernel wrapper.
Definition VecOpMisc.cu:585
void divEqVc(DeviceArray< cudaComplex > &a, DeviceArray< cudaReal > const &b, cudaReal const c)
Vector division in-place w/ coeff., a[i] /= (b[i] * c), kernel wrapper.
Definition VecOpMisc.cu:377
void mulEqVPair(DeviceArray< cudaReal > &a1, DeviceArray< cudaReal > &a2, DeviceArray< cudaReal > const &s)
In-place vector multiplication in pairs, ax[i] *= s[i], kernel wrapper.
Definition VecOpMisc.cu:448
void addEqVc(DeviceArray< cudaReal > &a, DeviceArray< cudaReal > const &b, cudaReal const c)
Vector addition in-place w/ coefficient, a[i] += b[i] * c, kernel wrapper.
Definition VecOpMisc.cu:343
PSCF package top-level namespace.
Definition param_pc.dox:1