PSCF v1.4.0
VecOpMisc.cu
1/*
2* PSCF - Polymer Self-Consistent Field
3*
4* Copyright 2015 - 2025, The Regents of the University of Minnesota
5* Distributed under the terms of the GNU General Public License.
6*/
7
8#include "VecOpMisc.h"
9#include "VecOp.h"
10#include <pscf/cuda/ThreadArray.h>
11#include <pscf/cuda/HostDArray.h>
12#include <pscf/cuda/cudaErrorCheck.h>
13#include <cmath>
14
15namespace Pscf {
16// namespace Prdc {
17// namespace Cuda {
18namespace VecOp {
19
20 // Anonymous namespace for CUDA kernels (only accessible in this file)
21 namespace {
22
23 /*
24 * Add scaled vectors, a[i] = (b[i]*c) + (d[i]*e).
25 *
26 * \param a real output array (LHS)
27 * \param b1 real input array 1 (RHS)
28 * \param c1 real coefficient of array 1 (RHS)
29 * \param b2 real input array 2 (RHS)
30 * \param c2 real coefficient of array 2 (RHS)
31 * \param n size of arrays
32 */
33 __global__
34 void _addVcVc(cudaReal* a,
35 cudaReal const * b1, cudaReal const c1,
36 cudaReal const * b2, cudaReal const c2,
37 const int n)
38 {
39 int nThreads = blockDim.x * gridDim.x;
40 int startID = blockIdx.x * blockDim.x + threadIdx.x;
41 for (int i = startID; i < n; i += nThreads) {
42 a[i] = (b1[i] * c1) + (b2[i] * c2);
43 }
44 }
45
46 /*
47 * Add a scaled vector and a scalar, a[i] = b[i]*c + s.
48 *
49 * \param a output array (LHS)
50 * \param b real input array (RHS)
51 * \param c real coefficient of array b (RHS)
52 * \param s real additive scalar (RHS)
53 * \param n size of arrays
54 */
55 __global__
56 void _addVcS(cudaReal* a,
57 cudaReal const * b, cudaReal const c,
58 cudaReal const s,
59 const int n)
60 {
61 int nThreads = blockDim.x * gridDim.x;
62 int startID = blockIdx.x * blockDim.x + threadIdx.x;
63 for (int i = startID; i < n; i += nThreads) {
64 a[i] = b[i] * c + s;
65 }
66 }
67
68 /*
69 * Add scaled vector in-place, a[i] += b[i] * c.
70 *
71 * \param a output array (LHS)
72 * \param b input array (RHS)
73 * \param c input scalar
74 * \param n size of arrays
75 */
76 __global__
77 void _addEqVc(cudaReal* a,
78 cudaReal const * b, cudaReal const c,
79 const int n)
80 {
81 int nThreads = blockDim.x * gridDim.x;
82 int startID = blockIdx.x * blockDim.x + threadIdx.x;
83 for (int i = startID; i < n; i += nThreads) {
84 a[i] += b[i] * c;
85 }
86 }
87
88 /*
89 * Add 3 scaled vectors, a[i] = b[i]*c + d[i]*e + f[i]*g.
90 *
91 * \param a output array (LHS)
92 * \param b1 input array 1 (RHS)
93 * \param c1 input scalar 1 (RHS)
94 * \param b2 input array 2 (RHS)
95 * \param c2 input scalar 2 (RHS)
96 * \param b3 input array 3 (RHS)
97 * \param c3 input scalar 3 (RHS)
98 * \param n size of arrays
99 */
100 __global__
101 void _addVcVcVc(cudaReal* a,
102 cudaReal const * b1, cudaReal const c1,
103 cudaReal const * b2, cudaReal const c2,
104 cudaReal const * b3, cudaReal const c3,
105 const int n)
106 {
107 int nThreads = blockDim.x * gridDim.x;
108 int startID = blockIdx.x * blockDim.x + threadIdx.x;
109 for (int i = startID; i < n; i += nThreads) {
110 a[i] = (b1[i] * c1) + (b2[i] * c2) + (b3[i] * c3);
111 }
112 }
113
114 /*
115 * Add scaled vectors and scalar, a[i] = b1[i]*c1 + b2[i]*c2 + s.
116 *
117 * \param a real output array (LHS)
118 * \param b1 real input array 1 (RHS)
119 * \param c1 real coefficient of b1 (RHS)
120 * \param b2 real input array 2 (RHS)
121 * \param c2 real coefficient of b2 (RHS)
122 * \param s real scalar summand (RHS)
123 * \param n size of arrays
124 */
125 __global__
126 void _addVcVcS(cudaReal* a,
127 cudaReal const * b1, cudaReal const c1,
128 cudaReal const * b2, cudaReal const c2,
129 const cudaReal s, const int n)
130 {
131 int nThreads = blockDim.x * gridDim.x;
132 int startID = blockIdx.x * blockDim.x + threadIdx.x;
133 for (int i = startID; i < n; i += nThreads) {
134 a[i] = (b1[i] * c1) + (b2[i] * c2) + s;
135 }
136 }
137
138 // In-place addition of scaled vectors
139
140 #if 0
141 /*
142 * Vector subtraction, a[i] = b[i] - c[i] - d.
143 *
144 * \param a output array (LHS)
145 * \param b input array (RHS)
146 * \param c input array (RHS)
147 * \param d input scalar (RHS)
148 * \param n size of arrays
149 */
150 __global__
151 void _subVVS(cudaReal* a,
152 cudaReal const * b,
153 cudaReal const * c,
154 cudaReal const d,
155 const int n)
156 {
157 int nThreads = blockDim.x * gridDim.x;
158 int startID = blockIdx.x * blockDim.x + threadIdx.x;
159 for (int i = startID; i < n; i += nThreads) {
160 a[i] = b[i] - c[i] - d;
161 }
162 }
163 #endif
164
165 /*
166 * Vector division in-place w/ coeff., a[i] /= (b[i] * c).
167 *
168 * \param a output array (LHS)
169 * \param b input array (RHS)
170 * \param c input scalar (RHS)
171 * \param n size of arrays
172 */
173 __global__
174 void _divEqVc(cudaComplex* a,
175 cudaReal const * b,
176 cudaReal const c,
177 const int n)
178 {
179 int nThreads = blockDim.x * gridDim.x;
180 int startID = blockIdx.x * blockDim.x + threadIdx.x;
181 for (int i = startID; i < n; i += nThreads) {
182 a[i].x /= (b[i] * c);
183 a[i].y /= (b[i] * c);
184 }
185 }
186
187 /*
188 * Vector exponentiation w/ coefficient, a[i] = exp(b[i]*c).
189 *
190 * \param a output array (LHS)
191 * \param b input array (RHS)
192 * \param c input scalar
193 * \param n size of arrays
194 */
195 __global__
196 void _expVc(cudaReal* a,
197 cudaReal const * b,
198 cudaReal const c,
199 const int n)
200 {
201 int nThreads = blockDim.x * gridDim.x;
202 int startID = blockIdx.x * blockDim.x + threadIdx.x;
203 for (int i = startID; i < n; i += nThreads) {
204 a[i] = exp(b[i] * c);
205 }
206 }
207
208 // Pair operations
209
210 /*
211 * Vector assignment in pairs, a1[i] = b[i] and a2[i] = b[i].
212 *
213 * \param a1 output array 1 (LHS)
214 * \param a2 output array 2 (LHS)
215 * \param b shared input array assigned to both a1 and a2 (RHS)
216 * \param n size of arrays
217 */
218 __global__
219 void _eqVPair(cudaReal* a1, cudaReal* a2,
220 cudaReal const * b, const int n)
221 {
222 int nThreads = blockDim.x * gridDim.x;
223 int startID = blockIdx.x * blockDim.x + threadIdx.x;
224 cudaReal input;
225 for (int i = startID; i < n; i += nThreads) {
226 input = b[i];
227 a1[i] = input;
228 a2[i] = input;
229 }
230 }
231
232 /*
233 * Vector multiplication in pairs, ax[i] = bx[i] * s[i].
234 *
235 * \param a1 output array 1 (LHS)
236 * \param a2 output array 2 (LHS)
237 * \param b1 input array 1 (RHS)
238 * \param b2 input array 2 (RHS)
239 * \param s shared input array to be multiplied by both a1 and a2
240 * \param n size of arrays
241 */
242 __global__
243 void _mulVVPair(cudaReal* a1, cudaReal * a2,
244 cudaReal const * b1, cudaReal const * b2,
245 cudaReal const * s, const int n)
246 {
247 int nThreads = blockDim.x * gridDim.x;
248 int startID = blockIdx.x * blockDim.x + threadIdx.x;
249 cudaReal input;
250 for (int i = startID; i < n; i += nThreads) {
251 input = s[i];
252 a1[i] = b1[i] * input;
253 a2[i] = b2[i] * input;
254 }
255 }
256
257 /*
258 * In-place vector-scalar multiplication in pairs, ax[i] *= s[i].
259 *
260 * \param a1 output array 1 (LHS)
261 * \param a2 output array 2 (LHS)
262 * \param b shared input array to multiplied both a1 and a2
263 * \param n size of arrays
264 */
265 __global__
266 void _mulEqVPair(cudaReal* a1, cudaReal* a2,
267 cudaReal const * b, const int n)
268 {
269 int nThreads = blockDim.x * gridDim.x;
270 int startID = blockIdx.x * blockDim.x + threadIdx.x;
271 cudaReal input;
272 for (int i = startID; i < n; i += nThreads) {
273 input = b[i];
274 a1[i] *= input;
275 a2[i] *= input;
276 }
277 }
278
279 /*
280 * Add an undefined number of vectors pointwise.
281 *
282 * The input const pointer 'vecs' points to an array of const pointers.
283 * In other words, this is an array of arrays, where each array is
284 * represented by its pointer. The size of vecs is nVecs, and the size
285 * of vecs[i] is n (if i < nVecs). These nVecs vectors will be added
286 * and the result will be stored in vector 'a'.
287 *
288 * \param a output array (LHS)
289 * \param vecs array of pointers to DeviceArrays to be added
290 * \param nVecs number of vectors to be added
291 * \param n size of arrays
292 */
293 __global__
294 void _addVMany(cudaReal* a,
295 cudaReal const ** vecs,
296 const int nVecs, const int n)
297 {
298 int nThreads = blockDim.x * gridDim.x;
299 int startID = blockIdx.x * blockDim.x + threadIdx.x;
300 for (int i = startID; i < n; i += nThreads) {
301 cudaReal sum = vecs[0][i];
302 for (int j = 1; j < nVecs; j++) {
303 sum += vecs[j][i];
304 }
305 a[i] = sum;
306 }
307 }
308
309 /*
310 * Multiply an undefined number of vectors pointwise.
311 *
312 * The input pointer 'vecs' points to an array of const pointers.
313 * In other words, this is an array of arrays, where each array is
314 * represented by its pointer. The size of vecs is nVecs, and the
315 * size of vecs[i] is n (if i < nVecs). These nVecs vectors will
316 * be multiplied and the result will be stored in vector 'a'.
317 *
318 * \param a output array (LHS)
319 * \param vecs array of pointers to DeviceArrays to be multiplied
320 * \param nVecs number of vectors to be multiplied
321 * \param n size of arrays
322 */
323 __global__
324 void _mulVMany(cudaReal* a,
325 cudaReal const ** vecs,
326 const int nVecs, const int n)
327 {
328 int nThreads = blockDim.x * gridDim.x;
329 int startID = blockIdx.x * blockDim.x + threadIdx.x;
330 for (int i = startID; i < n; i += nThreads) {
331 cudaReal prod = vecs[0][i];
332 for (int j = 1; j < nVecs; j++) {
333 prod *= vecs[j][i];
334 }
335 a[i] = prod;
336 }
337 }
338
339 /*
340 * Fourth power of magnitude of complex number, a[i] = |b[i]|^4.
341 *
342 * \param a output array (LHS)
343 * \param b input array (RHS)
344 * \param n size of arrays
345 */
346 __global__
347 void _sqSqAbsV(cudaReal* a, cudaComplex const * b, const int n)
348 {
349 int nThreads = blockDim.x * gridDim.x;
350 int startID = blockIdx.x * blockDim.x + threadIdx.x;
351 cudaReal tmp;
352 for (int i = startID; i < n; i += nThreads) {
353 tmp = (b[i].x * b[i].x) + (b[i].y * b[i].y);
354 a[i] = tmp * tmp;
355 }
356 }
357
358 } // End anonymous namespace
359
360 // Linear combinations - addition of scaled vectors
361
362 /*
363 * Add two scaled vectors, a[i] = b1[i]*c1 + b2[i]*c2.
364 */
366 DeviceArray<cudaReal> const & b1, cudaReal const c1,
367 DeviceArray<cudaReal> const & b2, cudaReal const c2)
368 {
369 const int n = a.capacity();
370 UTIL_CHECK(b1.capacity() >= n);
371 UTIL_CHECK(b2.capacity() >= n);
372
373 // GPU resources
374 int nBlocks, nThreads;
375 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
376
377 // Launch kernel
378 _addVcVc<<<nBlocks, nThreads>>>(a.cArray(),
379 b1.cArray(), c1,
380 b2.cArray(), c2, n);
381 cudaErrorCheck( cudaGetLastError() );
382 }
383
384 /*
385 * Add a scaled vector and a scalar, a[i] = b[i]*c + s.
386 */
388 DeviceArray<cudaReal> const & b, cudaReal const c,
389 cudaReal const s)
390 {
391 const int n = a.capacity();
392 UTIL_CHECK(b.capacity() >= n);
393
394 // GPU resources
395 int nBlocks, nThreads;
396 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
397
398 // Launch kernel
399 _addVcS<<<nBlocks, nThreads>>>(a.cArray(), b.cArray(), c, s, n);
400 cudaErrorCheck( cudaGetLastError() );
401 }
402
403 /*
404 * Add 3 scaled vectors, a[i] = b1[i]*c1 + b2[i]*c2 + b3[i]*c3).
405 */
407 DeviceArray<cudaReal> const & b1, cudaReal const c1,
408 DeviceArray<cudaReal> const & b2, cudaReal const c2,
409 DeviceArray<cudaReal> const & b3, cudaReal const c3)
410 {
411 const int n = a.capacity();
412 UTIL_CHECK(b1.capacity() >= n);
413 UTIL_CHECK(b2.capacity() >= n);
414 UTIL_CHECK(b3.capacity() >= n);
415
416 // GPU resources
417 int nBlocks, nThreads;
418 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
419
420 // Launch kernel
421 _addVcVcVc<<<nBlocks, nThreads>>>(a.cArray(),
422 b1.cArray(), c1,
423 b2.cArray(), c2,
424 b3.cArray(), c3, n);
425 cudaErrorCheck( cudaGetLastError() );
426 }
427
428 /*
429 * Add 2 scaled vectors and scalar, a[i] = b1[i]*c1 + b2[i]*c2 + s;
430 */
432 DeviceArray<cudaReal> const & b1, cudaReal const c1,
433 DeviceArray<cudaReal> const & b2, cudaReal const c2,
434 cudaReal const s)
435 {
436 const int n = a.capacity();
437 UTIL_CHECK(b1.capacity() >= n);
438 UTIL_CHECK(b2.capacity() >= n);
439
440 // GPU resources
441 int nBlocks, nThreads;
442 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
443
444 // Launch kernel
445 _addVcVcS<<<nBlocks, nThreads>>>(a.cArray(),
446 b1.cArray(), c1,
447 b2.cArray(), c2,
448 s, n);
449 cudaErrorCheck( cudaGetLastError() );
450 }
451
452 // Compound (in-place) addition of scaled vectors.
453
454 /*
455 * Add scaled vector in-place, a[i] += b[i] * c.
456 */
458 DeviceArray<cudaReal> const & b,
459 cudaReal const c)
460 {
461 const int n = a.capacity();
462 UTIL_CHECK(b.capacity() >= n);
463
464 // GPU resources
465 int nBlocks, nThreads;
466 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
467
468 // Launch kernel
469 _addEqVc<<<nBlocks, nThreads>>>(a.cArray(), b.cArray(), c, n);
470 cudaErrorCheck( cudaGetLastError() );
471 }
472
473 /*
474 * Vector in-place division by scaled vector, a[i] /= (b[i] * c).
475 */
477 DeviceArray<cudaReal> const & b, cudaReal const c)
478 {
479 const int n = a.capacity();
480 UTIL_CHECK(b.capacity() >= n);
481
482 // GPU resources
483 int nBlocks, nThreads;
484 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
485
486 // Launch kernel
487 _divEqVc<<<nBlocks, nThreads>>>(a.cArray(), b.cArray(), c, n);
488 cudaErrorCheck( cudaGetLastError() );
489 }
490
491 /*
492 * Vector exponentiation w/ coefficient, a[i] = exp(b[i]*c).
493 */
495 DeviceArray<cudaReal> const & b, cudaReal const c)
496 {
497 const int n = a.capacity();
498 UTIL_CHECK(b.capacity() >= n);
499
500 // GPU resources
501 int nBlocks, nThreads;
502 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
503
504 // Launch kernel
505 _expVc<<<nBlocks, nThreads>>>(a.cArray(), b.cArray(), c, n);
506 cudaErrorCheck( cudaGetLastError() );
507 }
508
509 /*
510 * Vector assignment in pairs, ax[i] = s[i].
511 */
513 DeviceArray<cudaReal> const & s)
514 {
515 const int n = a1.capacity();
516 UTIL_CHECK(a2.capacity() == n);
517 UTIL_CHECK(s.capacity() >= n);
518
519 // GPU resources
520 int nBlocks, nThreads;
521 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
522
523 // Launch kernel
524 _eqVPair<<<nBlocks, nThreads>>>(a1.cArray(), a2.cArray(),
525 s.cArray(), n);
526 cudaErrorCheck( cudaGetLastError() );
527 }
528
529 /*
530 * Vec. mul. in pairs, ax[i] = bx[i] * s[i].
531 */
533 DeviceArray<cudaReal> const & b1,
534 DeviceArray<cudaReal> const & b2,
535 DeviceArray<cudaReal> const & s)
536 {
537 const int n = a1.capacity();
538 UTIL_CHECK(a2.capacity() == n);
539 UTIL_CHECK(b1.capacity() >= n);
540 UTIL_CHECK(b2.capacity() >= n);
541 UTIL_CHECK(s.capacity() >= n);
542
543 // GPU resources
544 int nBlocks, nThreads;
545 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
546
547 // Launch kernel
548 _mulVVPair<<<nBlocks, nThreads>>>(a1.cArray(), a2.cArray(),
549 b1.cArray(), b2.cArray(),
550 s.cArray(), n);
551 cudaErrorCheck( cudaGetLastError() );
552 }
553
554 /*
555 * In-place vec. mul. in pairs, ax[i] *= s[i].
556 */
558 DeviceArray<cudaReal> const & s)
559 {
560 const int n = a1.capacity();
561 UTIL_CHECK(a2.capacity() == n);
562 UTIL_CHECK(s.capacity() >= n);
563
564 // GPU resources
565 int nBlocks, nThreads;
566 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
567
568 // Launch kernel
569 _mulEqVPair<<<nBlocks, nThreads>>>(a1.cArray(), a2.cArray(),
570 s.cArray(), n);
571 cudaErrorCheck( cudaGetLastError() );
572 }
573
574 /*
575 * Add an undefined number of vectors pointwise.
576 */
578 DArray<DeviceArray<cudaReal> > const & vecs)
579 {
580 int nVecs = vecs.capacity();
581 UTIL_CHECK(nVecs > 1);
582 int n = vecs[0].capacity();
583
584 if (nVecs == 2) {
585 addVV(a, vecs[0], vecs[1]);
586 return;
587 }
588
589 // Create array of pointers to arrays on host
590 HostDArray<cudaReal const *> vecs_h(nVecs);
591 for (int i = 0; i < nVecs; i++) {
592 vecs_h[i] = vecs[i].cArray();
593 }
594 DeviceArray<cudaReal const *> vecs_d(nVecs);
595 vecs_d = vecs_h; // transfer array of pointers to device
596
597 // GPU resources
598 int nBlocks, nThreads;
599 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
600
601 // Launch kernel
602 _addVMany<<<nBlocks, nThreads>>>(a.cArray(),
603 vecs_d.cArray(), nVecs, n);
604 cudaErrorCheck( cudaGetLastError() );
605 }
606
607 /*
608 * Add an undefined number of vectors pointwise.
609 */
611 DArray<DeviceArray<cudaReal> const *> const & vecs)
612 {
613 int nVecs = vecs.capacity();
614 UTIL_CHECK(nVecs > 1);
615 int n = vecs[0]->capacity();
616
617 if (nVecs == 2) {
618 addVV(a, *vecs[0], *vecs[1]);
619 return;
620 }
621
622 // Create array of pointers to arrays on host
623 HostDArray<cudaReal const *> vecs_h(nVecs);
624 for (int i = 0; i < nVecs; i++) {
625 vecs_h[i] = vecs[i]->cArray();
626 }
627 DeviceArray<cudaReal const *> vecs_d(nVecs);
628 vecs_d = vecs_h; // transfer array of pointers to device
629
630 // GPU resources
631 int nBlocks, nThreads;
632 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
633
634 // Launch kernel
635 _addVMany<<<nBlocks, nThreads>>>(a.cArray(),
636 vecs_d.cArray(), nVecs, n);
637 cudaErrorCheck( cudaGetLastError() );
638 }
639
640 /*
641 * Multiply an undefined number of vectors pointwise.
642 */
644 DArray<DeviceArray<cudaReal> > const & vecs)
645 {
646 int nVecs = vecs.capacity();
647 UTIL_CHECK(nVecs > 1);
648 int n = vecs[0].capacity();
649
650 if (nVecs == 2) {
651 mulVV(a, vecs[0], vecs[1]);
652 return;
653 }
654
655 // Create array of pointers to arrays on host
656 HostDArray<cudaReal const *> vecs_h(nVecs);
657 for (int i = 0; i < nVecs; i++) {
658 vecs_h[i] = vecs[i].cArray();
659 }
660 DeviceArray<cudaReal const *> vecs_d(nVecs);
661 vecs_d = vecs_h; // transfer array of pointers to device
662
663 // GPU resources
664 int nBlocks, nThreads;
665 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
666
667 // Launch kernel
668 _mulVMany<<<nBlocks, nThreads>>>(a.cArray(), vecs_d.cArray(),
669 nVecs, n);
670 cudaErrorCheck( cudaGetLastError() );
671 }
672
673 /*
674 * Multiply an undefined number of vectors pointwise.
675 */
677 DArray<DeviceArray<cudaReal> const *> const & vecs)
678 {
679 int nVecs = vecs.capacity();
680 UTIL_CHECK(nVecs > 1);
681 int n = vecs[0]->capacity();
682
683 if (nVecs == 2) {
684 mulVV(a, *vecs[0], *vecs[1]);
685 return;
686 }
687
688 // Create array of pointers to arrays on host
689 HostDArray<cudaReal const *> vecs_h(nVecs);
690 for (int i = 0; i < nVecs; i++) {
691 vecs_h[i] = vecs[i]->cArray();
692 }
693 DeviceArray<cudaReal const *> vecs_d(nVecs);
694 vecs_d = vecs_h; // transfer array of pointers to device
695
696 // GPU resources
697 int nBlocks, nThreads;
698 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
699
700 // Launch kernel
701 _mulVMany<<<nBlocks, nThreads>>>(a.cArray(),
702 vecs_d.cArray(), nVecs, n);
703 cudaErrorCheck( cudaGetLastError() );
704 }
705
706 /*
707 * Fourth power of magnitude of complex vector, a[i] = |b[i]|^4.
708 */
710 DeviceArray<cudaComplex> const & b)
711 {
712 const int n = a.capacity();
713 UTIL_CHECK(b.capacity() >= n);
714
715 // GPU resources
716 int nBlocks, nThreads;
717 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
718
719 // Launch kernel
720 _sqSqAbsV<<<nBlocks, nThreads>>>(a.cArray(), b.cArray(), n);
721 cudaErrorCheck( cudaGetLastError() );
722 }
723
724} // namespace VecOp
725} // namespace Pscf
Dynamic array on the GPU device with aligned data.
Definition DeviceArray.h:96
int capacity() const
Return array capacity.
Data * cArray()
Return pointer to underlying C array.
Template for dynamic array stored in host CPU memory.
Definition HostDArray.h:41
Data * cArray()
Return a pointer to the underlying C array.
Definition Array.h:199
Dynamically allocatable contiguous array template.
Definition DArray.h:32
#define UTIL_CHECK(condition)
Assertion macro suitable for serial or parallel production code.
Definition global.h:68
double sum(Array< double > const &in)
Compute sum of array elements (real).
Definition Reduce.cpp:20
void sqSqAbsV(Array< double > &a, Array< fftw_complex > const &b)
Fourth power of absolute magnitude, a[i] = |b[i]|^4 (complex).
Definition VecOpCx.cpp:714
void expVc(Array< double > &a, Array< double > const &b, const double c)
Exponentiation a scaled vector, a[i] = exp(b[i]*c) (real).
Definition VecOp.cpp:319
void mulVV(Array< double > &a, Array< double > const &b, Array< double > const &c)
Vector-vector multiplication, a[i] = b[i] * c[i] (real).
Definition VecOp.cpp:125
void addVV(Array< double > &a, Array< double > const &b, Array< double > const &c)
Vector-vector addition, a[i] = b[i] + c[i] (real)
Definition VecOp.cpp:64
void divEqVc(Array< fftw_complex > &a, Array< double > const &b, double c)
Vector division in-place w/ coeff., a[i] /= (b[i] * c).
Definition VecOpCx.cpp:730
void addEqVc(Array< double > &a, Array< double > const &b, const double c)
Add scaled vector in-place, a[i] += b[i]*c (real).
Definition VecOp.cpp:395
void setThreadsLogical(int nThreadsLogical)
Given total number of threads, set 1D execution configuration.
int nThreads()
Get the number of threads per block for execution.
void mulVVPair(Array< double > &a1, Array< double > &a2, Array< double > const &b1, Array< double > const &b2, Array< double > const &c)
Vector multiplication in pairs, ax[i] = bx[i] * s[i], x=1,2.
Definition VecOp.cpp:463
void eqVPair(Array< double > &a1, Array< double > &a2, Array< double > const &b)
Vector assignment in pairs, ax[i] = b[i], x = 1, 2.
Definition VecOp.cpp:446
void addVMany(DeviceArray< cudaReal > &a, DArray< DeviceArray< cudaReal > > const &vecs)
Add an arbitrary number of vectors pointwise (real).
Definition VecOpMisc.cu:577
void mulVMany(DeviceArray< cudaReal > &a, DArray< DeviceArray< cudaReal > > const &vecs)
Multiply an undefined number of vectors pointwise (real).
Definition VecOpMisc.cu:643
void mulEqVPair(Array< double > &a1, Array< double > &a2, Array< double > const &b)
In-place vector multiplication in pairs, ax[i] *= b[i], x=1,2.
Definition VecOp.cpp:484
Vector operations on GPU or CPU.
Definition VecOp.cpp:14
void addVcVcS(Array< double > &a, Array< double > const &b1, const double c1, Array< double > const &b2, const double c2, const double s)
Add scaled vectors + scalar, a[i] = b1[i]*c1 + b2[2]*c2 + s (real).
Definition VecOp.cpp:409
void addVcS(Array< double > &a, Array< double > const &b, const double c, const double s)
Add a scaled vector and a scalar, a[i] = b[i]*c + s (real).
Definition VecOp.cpp:380
void addVcVc(Array< double > &a, Array< double > const &b1, const double c1, Array< double > const &b2, const double c2)
Add two scaled vectors, a[i] = b1[i]*c1 + b2[2]*c2 (real).
Definition VecOp.cpp:364
void addVcVcVc(Array< double > &a, Array< double > const &b1, const double c1, Array< double > const &b2, const double c2, Array< double > const &b3, const double c3)
Add scaled vectors, a[i] = b1[i]*c1 + b2[i]*c2 + b3[i]*c3 (real).
Definition VecOp.cpp:426
PSCF package top-level namespace.
cufftDoubleComplex cudaComplex
Complex number type used in CPU code that uses FFTW.
Definition cudaTypes.h:22
cufftDoubleReal cudaReal
Real number type used in CPU code that uses FFTW.
Definition cudaTypes.h:35