PSCF v1.4.0
VecOp.cu
1/*
2* PSCF - Polymer Self-Consistent Field
3*
4* Copyright 2015 - 2025, The Regents of the University of Minnesota
5* Distributed under the terms of the GNU General Public License.
6*/
7
8#include "VecOp.h"
9#include <pscf/cuda/ThreadArray.h>
10#include <pscf/cuda/cudaErrorCheck.h>
11#include <cmath>
12
13namespace Pscf {
14namespace VecOp {
15
16// CUDA kernels:
17// (defined in anonymous namespace, used within this file only)
18
19namespace {
20
21 #if 0
22 /*
23 * Vector assignment, a[i] = b[i] (real).
24 *
25 * \param a real array (LHS)
26 * \param b real array (RHS)
27 * \param n size of arrays
28 */
29 __global__
30 void _eqV(cudaReal* a, cudaReal const * b, const int n)
31 {
32 int nThreads = blockDim.x * gridDim.x;
33 int startID = blockIdx.x * blockDim.x + threadIdx.x;
34 for (int i = startID; i < n; i += nThreads) {
35 a[i] = b[i];
36 }
37 }
38 #endif
39
40 /*
41 * Vector assignment, a[i] = b[i] (complex).
42 *
43 * \param a complex array (LHS)
44 * \param b complex array (RHS)
45 * \param n size of arrays
46 */
47 __global__
48 void _eqV(cudaComplex* a, cudaComplex const * b, const int n)
49 {
50 int nThreads = blockDim.x * gridDim.x;
51 int startID = blockIdx.x * blockDim.x + threadIdx.x;
52 for (int i = startID; i < n; i += nThreads) {
53 a[i].x = b[i].x;
54 a[i].y = b[i].y;
55 }
56 }
57
58 /*
59 * Vector assignment, a[j] = (b[j], c[j]) (complex, real & imaginary).
60 *
61 * \param a complex array (LHS)
62 * \param b real array, real part (RHS)
63 * \param c real array, imaginary part (RHS)
64 * \param n size of arrays
65 */
66 __global__
67 void _eqV(cudaComplex* a,
68 cudaReal const * b, cudaReal const * c,
69 const int n)
70 {
71 int nThreads = blockDim.x * gridDim.x;
72 int startID = blockIdx.x * blockDim.x + threadIdx.x;
73 for (int i = startID; i < n; i += nThreads) {
74 a[i].x = b[i];
75 a[i].y = c[i];
76 }
77 }
78
79 /*
80 * Vector assignment, a[i] = b (real).
81 *
82 * \param a real array (LHS)
83 * \param b real scalar (RHS)
84 * \param n size of arrays
85 */
86 __global__
87 void _eqS(cudaReal* a, const cudaReal b, const int n)
88 {
89 int nThreads = blockDim.x * gridDim.x;
90 int startID = blockIdx.x * blockDim.x + threadIdx.x;
91 for (int i = startID; i < n; i += nThreads) {
92 a[i] = b;
93 }
94 }
95
96 /*
97 * Vector assignment, a[i] = b(complex).
98 *
99 * \param a complex array (LHS)
100 * \param b complex scalar (RHS)
101 * \param n size of arrays
102 */
103 __global__
104 void _eqS(cudaComplex* a, const cudaComplex b, const int n)
105 {
106 int nThreads = blockDim.x * gridDim.x;
107 int startID = blockIdx.x * blockDim.x + threadIdx.x;
108 for (int i = startID; i < n; i += nThreads) {
109 a[i].x = b.x;
110 a[i].y = b.y;
111 }
112 }
113
114 /*
115 * Vector addition, a[i] = b[i] + c[i] (real).
116 *
117 * \param a real array (LHS)
118 * \param b real array (RHS)
119 * \param c real array (RHS)
120 * \param n size of arrays
121 */
122 __global__
123 void _addVV(cudaReal* a,
124 cudaReal const * b,
125 cudaReal const * c, const int n)
126 {
127 int nThreads = blockDim.x * gridDim.x;
128 int startID = blockIdx.x * blockDim.x + threadIdx.x;
129 for (int i = startID; i < n; i += nThreads) {
130 a[i] = b[i] + c[i];
131 }
132 }
133
134 /*
135 * Vector addition, a[i] = b[i] + c[i](complex).
136 *
137 * \param a output array (LHS)
138 * \param b input array (RHS)
139 * \param c input array (RHS)
140 * \param n size of arrays
141 */
142 __global__
143 void _addVV(cudaComplex* a,
144 cudaComplex const * b,
145 cudaComplex const * c, const int n)
146 {
147 int nThreads = blockDim.x * gridDim.x;
148 int startID = blockIdx.x * blockDim.x + threadIdx.x;
149 for (int i = startID; i < n; i += nThreads) {
150 a[i].x = b[i].x + c[i].x;
151 a[i].y = b[i].y + c[i].y;
152 }
153 }
154
155 /*
156 * Vector addition, a[i] = b[i] + c[i] (mixed, b real).
157 *
158 * \param a complex array (LHS)
159 * \param b real array (RHS)
160 * \param c complex array (RHS)
161 * \param n size of arrays
162 */
163 __global__
164 void _addVV(cudaComplex* a,
165 cudaReal const * b,
166 cudaComplex const * c, const int n)
167 {
168 int nThreads = blockDim.x * gridDim.x;
169 int startID = blockIdx.x * blockDim.x + threadIdx.x;
170 for (int i = startID; i < n; i += nThreads) {
171 a[i].x = b[i] + c[i].x;
172 a[i].y = c[i].y;
173 }
174 }
175
176 /*
177 * Vector addition, a[i] = b[i] + c[i] (mixed, c real).
178 *
179 * \param a complex array (LHS)
180 * \param b complex array (RHS)
181 * \param c real array (RHS)
182 * \param n size of arrays
183 */
184 __global__
185 void _addVV(cudaComplex* a,
186 cudaComplex const * b,
187 cudaReal const * c, const int n)
188 {
189 int nThreads = blockDim.x * gridDim.x;
190 int startID = blockIdx.x * blockDim.x + threadIdx.x;
191 for (int i = startID; i < n; i += nThreads) {
192 a[i].x = b[i].x + c[i];
193 a[i].y = b[i].y;
194 }
195 }
196
197 /*
198 * Vector addition, a[i] = b[i] + c (real).
199 *
200 * \param a output array (LHS)
201 * \param b input array (RHS)
202 * \param c input scalar (RHS)
203 * \param n size of arrays
204 */
205 __global__
206 void _addVS(cudaReal* a,
207 cudaReal const * b,
208 const cudaReal c, const int n)
209 {
210 int nThreads = blockDim.x * gridDim.x;
211 int startID = blockIdx.x * blockDim.x + threadIdx.x;
212 for (int i = startID; i < n; i += nThreads) {
213 a[i] = b[i] + c;
214 }
215 }
216
217 /*
218 * Vector addition, a[i] = b[i] + c(complex).
219 *
220 * \param a output array (LHS)
221 * \param b input array (RHS)
222 * \param c input scalar (RHS)
223 * \param n size of arrays
224 */
225 __global__
226 void _addVS(cudaComplex* a,
227 cudaComplex const * b,
228 const cudaComplex c, const int n)
229 {
230 int nThreads = blockDim.x * gridDim.x;
231 int startID = blockIdx.x * blockDim.x + threadIdx.x;
232 for (int i = startID; i < n; i += nThreads) {
233 a[i].x = b[i].x + c.x;
234 a[i].y = b[i].y + c.y;
235 }
236 }
237
238 /*
239 * Vector addition, a[i] = b[i] + c (mixed, b real).
240 *
241 * \param a complex array (LHS)
242 * \param b real array (RHS)
243 * \param c complex scalar (RHS)
244 * \param n size of arrays
245 */
246 __global__
247 void _addVS(cudaComplex* a,
248 cudaReal const * b,
249 const cudaComplex c, const int n)
250 {
251 int nThreads = blockDim.x * gridDim.x;
252 int startID = blockIdx.x * blockDim.x + threadIdx.x;
253 for (int i = startID; i < n; i += nThreads) {
254 a[i].x = b[i] + c.x;
255 a[i].y = c.y;
256 }
257 }
258
259 /*
260 * Vector-scalar addition, a[i] = b[i] + c (mixed).
261 *
262 * \param a complex array (LHS)
263 * \param b complex array (RHS)
264 * \param c real scalar (RHS)
265 * \param n size of arrays
266 */
267 __global__
268 void _addVS(cudaComplex* a,
269 cudaComplex const * b,
270 const cudaReal c, const int n)
271 {
272 int nThreads = blockDim.x * gridDim.x;
273 int startID = blockIdx.x * blockDim.x + threadIdx.x;
274 for (int i = startID; i < n; i += nThreads) {
275 a[i].x = b[i].x + c;
276 a[i].y = b[i].y;
277 }
278 }
279
280 /*
281 * Vector subtraction, a[i] = b[i] - c[i] (real).
282 *
283 * \param a real array (LHS)
284 * \param b real array (RHS)
285 * \param c real array (RHS)
286 * \param n size of arrays
287 */
288 __global__
289 void _subVV(cudaReal* a,
290 cudaReal const * b,
291 cudaReal const * c, const int n)
292 {
293 int nThreads = blockDim.x * gridDim.x;
294 int startID = blockIdx.x * blockDim.x + threadIdx.x;
295 for (int i = startID; i < n; i += nThreads) {
296 a[i] = b[i] - c[i];
297 }
298 }
299
300 /*
301 * Vector subtraction, a[i] = b[i] - c[i](complex).
302 *
303 * \param a output array (LHS)
304 * \param b input array (RHS)
305 * \param c input array (RHS)
306 * \param n size of arrays
307 */
308 __global__ void _subVV(cudaComplex* a, cudaComplex const * b,
309 cudaComplex const * c, const int n)
310 {
311 int nThreads = blockDim.x * gridDim.x;
312 int startID = blockIdx.x * blockDim.x + threadIdx.x;
313 for (int i = startID; i < n; i += nThreads) {
314 a[i].x = b[i].x - c[i].x;
315 a[i].y = b[i].y - c[i].y;
316 }
317 }
318
319 /*
320 * Vector subtraction, a[i] = b[i] - c[i], (mixed, b real).
321 *
322 * \param a output array (LHS)
323 * \param b input array (RHS)
324 * \param c input array (RHS)
325 * \param n size of arrays
326 */
327 __global__ void _subVV(cudaComplex* a, cudaReal const * b,
328 cudaComplex const * c, const int n)
329 {
330 int nThreads = blockDim.x * gridDim.x;
331 int startID = blockIdx.x * blockDim.x + threadIdx.x;
332 for (int i = startID; i < n; i += nThreads) {
333 a[i].x = b[i] - c[i].x;
334 a[i].y = 0.0 - c[i].y;
335 }
336 }
337
338 /*
339 * Vector subtraction, a[i] = b[i] - c[i] (mixed, c real).
340 *
341 * \param a output array (LHS)
342 * \param b input array (RHS)
343 * \param c input array (RHS)
344 * \param n size of arrays
345 */
346 __global__
347 void _subVV(cudaComplex* a,
348 cudaComplex const * b,
349 cudaReal const * c, const int n)
350 {
351 int nThreads = blockDim.x * gridDim.x;
352 int startID = blockIdx.x * blockDim.x + threadIdx.x;
353 for (int i = startID; i < n; i += nThreads) {
354 a[i].x = b[i].x - c[i];
355 a[i].y = b[i].y;
356 }
357 }
358
359 /*
360 * Vector-scalar subtraction, a[i] = b[i] - c (real).
361 *
362 * \param a output array (LHS)
363 * \param b input array (RHS)
364 * \param c input scalar (RHS)
365 * \param n size of arrays
366 */
367 __global__
368 void _subVS(cudaReal* a,
369 cudaReal const * b,
370 const cudaReal c, const int n)
371 {
372 int nThreads = blockDim.x * gridDim.x;
373 int startID = blockIdx.x * blockDim.x + threadIdx.x;
374 for (int i = startID; i < n; i += nThreads) {
375 a[i] = b[i] - c;
376 }
377 }
378
379 /*
380 * Vector-scalar subtraction, a[i] = b[i] - c (complex).
381 *
382 * \param a output array (LHS)
383 * \param b input array (RHS)
384 * \param c input scalar (RHS)
385 * \param n size of arrays
386 */
387 __global__
388 void _subVS(cudaComplex* a,
389 cudaComplex const * b,
390 const cudaComplex c, const int n)
391 {
392 int nThreads = blockDim.x * gridDim.x;
393 int startID = blockIdx.x * blockDim.x + threadIdx.x;
394 for (int i = startID; i < n; i += nThreads) {
395 a[i].x = b[i].x - c.x;
396 a[i].y = b[i].y - c.y;
397 }
398 }
399
400 /*
401 * Vector-scalar subtraction, a[i] = b[i] - c (mixed, b real).
402 *
403 * \param a complex output array (LHS)
404 * \param b real input array (RHS)
405 * \param c real input scalar (RHS)
406 * \param n size of arrays
407 */
408 __global__
409 void _subVS(cudaComplex* a, cudaReal const * b,
410 const cudaComplex c, const int n)
411 {
412 int nThreads = blockDim.x * gridDim.x;
413 int startID = blockIdx.x * blockDim.x + threadIdx.x;
414 for (int i = startID; i < n; i += nThreads) {
415 a[i].x = b[i] - c.x;
416 a[i].y = 0.0 - c.y;
417 }
418 }
419
420 /*
421 * Vector subtraction, a[i] = b[i] - c (mixed, c real).
422 *
423 * \param a output array (LHS)
424 * \param b input array (RHS)
425 * \param c input scalar (RHS)
426 * \param n size of arrays
427 */
428 __global__
429 void _subVS(cudaComplex* a,
430 cudaComplex const * b,
431 const cudaReal c, const int n)
432 {
433 int nThreads = blockDim.x * gridDim.x;
434 int startID = blockIdx.x * blockDim.x + threadIdx.x;
435 for (int i = startID; i < n; i += nThreads) {
436 a[i].x = b[i].x - c;
437 a[i].y = b[i].y;
438 }
439 }
440
441 /*
442 * Vector multiplication, a[i] = b[i] * c[i] (real).
443 *
444 * \param a output array (LHS)
445 * \param b input array (RHS)
446 * \param c input array (RHS)
447 * \param n size of arrays
448 */
449 __global__
450 void _mulVV(cudaReal* a,
451 cudaReal const * b,
452 cudaReal const * c, const int n)
453 {
454 int nThreads = blockDim.x * gridDim.x;
455 int startID = blockIdx.x * blockDim.x + threadIdx.x;
456 for (int i = startID; i < n; i += nThreads) {
457 a[i] = b[i] * c[i];
458 }
459 }
460
461 /*
462 * Vector multiplication, a[i] = b[i] * c[i](complex).
463 *
464 * \param a output array (LHS)
465 * \param b input array (RHS)
466 * \param c input array (RHS)
467 * \param n size of arrays
468 */
469 __global__
470 void _mulVV(cudaComplex* a,
471 cudaComplex const * b,
472 cudaComplex const * c, const int n)
473 {
474 int nThreads = blockDim.x * gridDim.x;
475 int startID = blockIdx.x * blockDim.x + threadIdx.x;
476 for (int i = startID; i < n; i += nThreads) {
477 a[i].x = (b[i].x * c[i].x) - (b[i].y * c[i].y);
478 a[i].y = (b[i].x * c[i].y) + (b[i].y * c[i].x);
479 }
480 }
481
482 /*
483 * Vector multiplication, a[i] = b[i] * c[i] (mixed, b real).
484 *
485 * \param a complex output array (LHS)
486 * \param b real input array (RHS)
487 * \param c complex input array (RHS)
488 * \param n size of arrays
489 */
490 __global__ void _mulVV(cudaComplex* a,
491 cudaReal const * b,
492 cudaComplex const * c, const int n)
493 {
494 int nThreads = blockDim.x * gridDim.x;
495 int startID = blockIdx.x * blockDim.x + threadIdx.x;
496 for (int i = startID; i < n; i += nThreads) {
497 a[i].x = b[i] * c[i].x;
498 a[i].y = b[i] * c[i].y;
499 }
500 }
501
502 /*
503 * Vector multiplication, a[i] = b[i] * c[i] (mixed, c real).
504 *
505 * \param a complex output array (LHS)
506 * \param b complex input array (RHS)
507 * \param c real input array (RHS)
508 * \param n size of arrays
509 */
510 __global__ void _mulVV(cudaComplex* a,
511 cudaComplex const * b,
512 cudaReal const * c, const int n)
513 {
514 int nThreads = blockDim.x * gridDim.x;
515 int startID = blockIdx.x * blockDim.x + threadIdx.x;
516 for (int i = startID; i < n; i += nThreads) {
517 a[i].x = b[i].x * c[i];
518 a[i].y = b[i].y * c[i];
519 }
520 }
521
522 /*
523 * Vector multiplication, a[i] = b[i] * c (real).
524 *
525 * \param a output array (LHS)
526 * \param b input array (RHS)
527 * \param c input scalar (RHS)
528 * \param n size of arrays
529 */
530 __global__ void _mulVS(cudaReal* a, cudaReal const * b,
531 const cudaReal c, const int n)
532 {
533 int nThreads = blockDim.x * gridDim.x;
534 int startID = blockIdx.x * blockDim.x + threadIdx.x;
535 for (int i = startID; i < n; i += nThreads) {
536 a[i] = b[i] * c;
537 }
538 }
539
540 /*
541 * Vector-scalar multiplication, a[i] = b[i] * c(complex).
542 *
543 * \param a output array (LHS)
544 * \param b input array (RHS)
545 * \param c input scalar (RHS)
546 * \param n size of arrays
547 */
548 __global__
549 void _mulVS(cudaComplex* a,
550 cudaComplex const * b,
551 const cudaComplex c, const int n)
552 {
553 int nThreads = blockDim.x * gridDim.x;
554 int startID = blockIdx.x * blockDim.x + threadIdx.x;
555 for (int i = startID; i < n; i += nThreads) {
556 a[i].x = (b[i].x * c.x) - (b[i].y * c.y);
557 a[i].y = (b[i].x * c.y) + (b[i].y * c.x);
558 }
559 }
560
561 /*
562 * Vector multiplication, a[i] = b[i] * c (mixed, b real).
563 *
564 * \param a output array (LHS)
565 * \param b input array (RHS)
566 * \param c input scalar (RHS)
567 * \param n size of arrays
568 */
569 __global__
570 void _mulVS(cudaComplex* a,
571 cudaReal const * b,
572 const cudaComplex c, const int n)
573 {
574 int nThreads = blockDim.x * gridDim.x;
575 int startID = blockIdx.x * blockDim.x + threadIdx.x;
576 for (int i = startID; i < n; i += nThreads) {
577 a[i].x = b[i] * c.x;
578 a[i].y = b[i] * c.y;
579 }
580 }
581
582 /*
583 * Vector multiplication, a[i] = b[i] * c (mixed, c real).
584 *
585 * \param a output array (LHS)
586 * \param b input array (RHS)
587 * \param c input scalar (RHS)
588 * \param n size of arrays
589 */
590 __global__ void _mulVS(cudaComplex* a, cudaComplex const * b,
591 const cudaReal c, const int n)
592 {
593 int nThreads = blockDim.x * gridDim.x;
594 int startID = blockIdx.x * blockDim.x + threadIdx.x;
595 for (int i = startID; i < n; i += nThreads) {
596 a[i].x = b[i].x * c;
597 a[i].y = b[i].y * c;
598 }
599 }
600
601 /*
602 * Vector division, a[i] = b[i] / c[i] (real).
603 *
604 * \param a output array (LHS)
605 * \param b input array (RHS)
606 * \param c input array (RHS)
607 * \param n size of arrays
608 */
609 __global__ void _divVV(cudaReal* a, cudaReal const * b,
610 cudaReal const * c, const int n)
611 {
612 int nThreads = blockDim.x * gridDim.x;
613 int startID = blockIdx.x * blockDim.x + threadIdx.x;
614 for (int i = startID; i < n; i += nThreads) {
615 a[i] = b[i] / c[i];
616 }
617 }
618
619 /*
620 * Vector division, a[i] = b[i] / c[i] (mixed, c real).
621 *
622 * \param a output array (LHS)
623 * \param b input array (RHS)
624 * \param c input array (RHS)
625 * \param n size of arrays
626 */
627 __global__ void _divVV(cudaComplex* a, cudaComplex const * b,
628 cudaReal const * c, const int n)
629 {
630 int nThreads = blockDim.x * gridDim.x;
631 int startID = blockIdx.x * blockDim.x + threadIdx.x;
632 for (int i = startID; i < n; i += nThreads) {
633 a[i].x = b[i].x / c[i];
634 a[i].y = b[i].y / c[i];
635 }
636 }
637
638 /*
639 * Vector division, a[i] = b[i] / c (real).
640 *
641 * \param a output array (LHS)
642 * \param b input array (RHS)
643 * \param c input scalar (RHS)
644 * \param n size of arrays
645 */
646 __global__ void _divVS(cudaReal* a, cudaReal const * b,
647 const cudaReal c, const int n)
648 {
649 int nThreads = blockDim.x * gridDim.x;
650 int startID = blockIdx.x * blockDim.x + threadIdx.x;
651 for (int i = startID; i < n; i += nThreads) {
652 a[i] = b[i] / c;
653 }
654 }
655
656 /*
657 * Vector division, a[i] = b[i] / c (mixed, c real).
658 *
659 * \param a output array (LHS)
660 * \param b input array (RHS)
661 * \param c input scalar (RHS)
662 * \param n size of arrays
663 */
664 __global__ void _divVS(cudaComplex* a, cudaComplex const * b,
665 const cudaReal c, const int n)
666 {
667 int nThreads = blockDim.x * gridDim.x;
668 int startID = blockIdx.x * blockDim.x + threadIdx.x;
669 for (int i = startID; i < n; i += nThreads) {
670 a[i].x = b[i].x / c;
671 a[i].y = b[i].y / c;
672 }
673 }
674
675 /*
676 * Vector division, a[i] = b / c[i] (real).
677 *
678 * \param a output array (LHS)
679 * \param b input scalar (RHS)
680 * \param c input array (RHS)
681 * \param n size of arrays
682 */
683 __global__ void _divSV(cudaReal* a, const cudaReal b,
684 cudaReal const * c, const int n)
685 {
686 int nThreads = blockDim.x * gridDim.x;
687 int startID = blockIdx.x * blockDim.x + threadIdx.x;
688 for (int i = startID; i < n; i += nThreads) {
689 a[i] = b / c[i];
690 }
691 }
692
693 // In-place addition
694
695 /*
696 * Vector in-place addition, a[i] += b[i] (real).
697 *
698 * \param a real array (LHS)
699 * \param b real array (RHS)
700 * \param n size of arrays
701 */
702 __global__
703 void _addEqV(cudaReal* a, cudaReal const * b, const int n)
704 {
705 int nThreads = blockDim.x * gridDim.x;
706 int startID = blockIdx.x * blockDim.x + threadIdx.x;
707 for (int i = startID; i < n; i += nThreads) {
708 a[i] += b[i];
709 }
710 }
711
712 /*
713 * Vector in-place addition, a[i] += b[i] (complex).
714 *
715 * \param a output array (LHS)
716 * \param b input array (RHS)
717 * \param n size of arrays
718 */
719 __global__
720 void _addEqV(cudaComplex* a, cudaComplex const * b, const int n)
721 {
722 int nThreads = blockDim.x * gridDim.x;
723 int startID = blockIdx.x * blockDim.x + threadIdx.x;
724 for (int i = startID; i < n; i += nThreads) {
725 a[i].x += b[i].x;
726 a[i].y += b[i].y;
727 }
728 }
729
730 /*
731 * Vector in-place addition, a[i] += (b[i],c[i]) (complex, real/imag).
732 *
733 * \param a complex input / output array (LHS)
734 * \param b real array, increment to real part (RHS)
735 * \param c real array, increment to imaginary part (RHS)
736 * \param n size of arrays
737 */
738 __global__
739 void _addEqV(cudaComplex* a,
740 cudaReal const * b,
741 cudaReal const * c,
742 const int n)
743 {
744 int nThreads = blockDim.x * gridDim.x;
745 int startID = blockIdx.x * blockDim.x + threadIdx.x;
746 for (int i = startID; i < n; i += nThreads) {
747 a[i].x += b[i];
748 a[i].y += c[i];
749 }
750 }
751
752 /*
753 * Vector addition in-place, a[i] += b[i] (mixed).
754 *
755 * \param a output array (LHS)
756 * \param b input array (RHS)
757 * \param n size of arrays
758 */
759 __global__
760 void _addEqV(cudaComplex* a, cudaReal const * b, const int n)
761 {
762 int nThreads = blockDim.x * gridDim.x;
763 int startID = blockIdx.x * blockDim.x + threadIdx.x;
764 for (int i = startID; i < n; i += nThreads) {
765 a[i].x += b[i];
766 }
767 }
768
769 /*
770 * Vector addition in-place, a[i] += b (real).
771 *
772 * \param a output array (LHS)
773 * \param b input scalar (RHS)
774 * \param n size of arrays
775 */
776 __global__
777 void _addEqS(cudaReal* a, const cudaReal b, const int n)
778 {
779 int nThreads = blockDim.x * gridDim.x;
780 int startID = blockIdx.x * blockDim.x + threadIdx.x;
781 for (int i = startID; i < n; i += nThreads) {
782 a[i] += b;
783 }
784 }
785
786 /*
787 * Vector-scalar in-place addition, a[i] += b (complex).
788 *
789 * \param a complex array (LHS)
790 * \param b complex scalar (RHS)
791 * \param n size of arrays
792 */
793 __global__
794 void _addEqS(cudaComplex* a, const cudaComplex b,
795 const int n)
796 {
797 int nThreads = blockDim.x * gridDim.x;
798 int startID = blockIdx.x * blockDim.x + threadIdx.x;
799 for (int i = startID; i < n; i += nThreads) {
800 a[i].x += b.x;
801 a[i].y += b.y;
802 }
803 }
804
805 /*
806 * Vector addition in-place, a[i] += b (mixed).
807 *
808 * \param a complex array (LHS)
809 * \param b real scalar (RHS)
810 * \param n size of arrays
811 */
812 __global__
813 void _addEqS(cudaComplex* a, const cudaReal b, const int n)
814 {
815 int nThreads = blockDim.x * gridDim.x;
816 int startID = blockIdx.x * blockDim.x + threadIdx.x;
817 for (int i = startID; i < n; i += nThreads) {
818 a[i].x += b;
819 }
820 }
821
822 /*
823 * Vector in-place subtraction, a[i] -= b[i] (real).
824 *
825 * \param a real array (LHS)
826 * \param b real array (RHS)
827 * \param n size of arrays
828 */
829 __global__
830 void _subEqV(cudaReal* a, cudaReal const * b, const int n)
831 {
832 int nThreads = blockDim.x * gridDim.x;
833 int startID = blockIdx.x * blockDim.x + threadIdx.x;
834 for (int i = startID; i < n; i += nThreads) {
835 a[i] -= b[i];
836 }
837 }
838
839 /*
840 * Vector in-place subtraction, a[i] -= b[i] (complex).
841 *
842 * \param a complex array (LHS)
843 * \param b complex array (RHS)
844 * \param n size of arrays
845 */
846 __global__
847 void _subEqV(cudaComplex* a, cudaComplex const * b, const int n)
848 {
849 int nThreads = blockDim.x * gridDim.x;
850 int startID = blockIdx.x * blockDim.x + threadIdx.x;
851 for (int i = startID; i < n; i += nThreads) {
852 a[i].x -= b[i].x;
853 a[i].y -= b[i].y;
854 }
855 }
856
857 /*
858 * Vector in-place subtraction, a[i] -= b[i] (mixed).
859 *
860 * \param a complex array (LHS)
861 * \param b real array (RHS)
862 * \param n size of arrays
863 */
864 __global__
865 void _subEqV(cudaComplex* a, cudaReal const * b, const int n)
866 {
867 int nThreads = blockDim.x * gridDim.x;
868 int startID = blockIdx.x * blockDim.x + threadIdx.x;
869 for (int i = startID; i < n; i += nThreads) {
870 a[i].x -= b[i];
871 }
872 }
873
874 /*
875 * Vector in-place subtraction, a[i] -= b (real).
876 *
877 * \param a real array (LHS)
878 * \param b real scalar (RHS)
879 * \param n size of arrays
880 */
881 __global__
882 void _subEqS(cudaReal* a, const cudaReal b, const int n)
883 {
884 int nThreads = blockDim.x * gridDim.x;
885 int startID = blockIdx.x * blockDim.x + threadIdx.x;
886 for (int i = startID; i < n; i += nThreads) {
887 a[i] -= b;
888 }
889 }
890
891 /*
892 * Vector in-place subtraction, a[i] -= b (complex).
893 *
894 * \param a complex array (LHS)
895 * \param b complex scalar (RHS)
896 * \param n size of arrays
897 */
898 __global__
899 void _subEqS(cudaComplex* a, const cudaComplex b, const int n)
900 {
901 int nThreads = blockDim.x * gridDim.x;
902 int startID = blockIdx.x * blockDim.x + threadIdx.x;
903 for (int i = startID; i < n; i += nThreads) {
904 a[i].x -= b.x;
905 a[i].y -= b.y;
906 }
907 }
908
909 /*
910 * Vector in-place subtraction, a[i] -= b (mixed).
911 *
912 * \param a complex array (LHS)
913 * \param b real scalar (RHS)
914 * \param n size of arrays
915 */
916 __global__
917 void _subEqS(cudaComplex* a, const cudaReal b, const int n)
918 {
919 int nThreads = blockDim.x * gridDim.x;
920 int startID = blockIdx.x * blockDim.x + threadIdx.x;
921 for (int i = startID; i < n; i += nThreads) {
922 a[i].x -= b;
923 }
924 }
925
926 /*
927 * Vector in-place multiplication, a[i] *= b[i] (real).
928 *
929 * \param a real array (LHS)
930 * \param b real array (RHS)
931 * \param n size of arrays
932 */
933 __global__
934 void _mulEqV(cudaReal* a, cudaReal const * b, const int n)
935 {
936 int nThreads = blockDim.x * gridDim.x;
937 int startID = blockIdx.x * blockDim.x + threadIdx.x;
938 for (int i = startID; i < n; i += nThreads) {
939 a[i] *= b[i];
940 }
941 }
942
943 /*
944 * Vector in-place multiplication, a[i] *= b[i], (complex).
945 *
946 * \param a complex array (LHS)
947 * \param b complex array (RHS)
948 * \param n size of arrays
949 */
950 __global__
951 void _mulEqV(cudaComplex* a, cudaComplex const * b, const int n)
952 {
953 int nThreads = blockDim.x * gridDim.x;
954 int startID = blockIdx.x * blockDim.x + threadIdx.x;
955 cudaComplex c;
956 for (int i = startID; i < n; i += nThreads) {
957 c.x = (a[i].x * b[i].x) - (a[i].y * b[i].y);
958 c.y = (a[i].x * b[i].y) + (a[i].y * b[i].x);
959 a[i].x = c.x;
960 a[i].y = c.y;
961 }
962 }
963
964 /*
965 * Vector in-place multiplication, a[i]*=b[i] (mixed).
966 *
967 * \param a complex array (LHS)
968 * \param b real array (RHS)
969 * \param n size of arrays
970 */
971 __global__
972 void _mulEqV(cudaComplex* a, cudaReal const * b, const int n)
973 {
974 int nThreads = blockDim.x * gridDim.x;
975 int startID = blockIdx.x * blockDim.x + threadIdx.x;
976 for (int i = startID; i < n; i += nThreads) {
977 a[i].x *= b[i];
978 a[i].y *= b[i];
979 }
980 }
981
982 /*
983 * Vector-scalar in-place multiplication, a[i] *= b (real).
984 *
985 * \param a real array (LHS)
986 * \param b real scalar (RHS)
987 * \param n size of arrays
988 */
989 __global__
990 void _mulEqS(cudaReal* a, const cudaReal b, const int n)
991 {
992 int nThreads = blockDim.x * gridDim.x;
993 int startID = blockIdx.x * blockDim.x + threadIdx.x;
994 for (int i = startID; i < n; i += nThreads) {
995 a[i] *= b;
996 }
997 }
998
999 /*
1000 * Vector-scalar in-place multiplication, a[i] *= b (complex).
1001 *
1002 * \param a complex array (LHS)
1003 * \param b complex scalar (RHS)
1004 * \param n size of arrays
1005 */
1006 __global__
1007 void _mulEqS(cudaComplex* a, const cudaComplex b, const int n)
1008 {
1009 int nThreads = blockDim.x * gridDim.x;
1010 int startID = blockIdx.x * blockDim.x + threadIdx.x;
1011 cudaComplex c;
1012 for (int i = startID; i < n; i += nThreads) {
1013 c.x = (a[i].x * b.x) - (a[i].y * b.y);
1014 c.y = (a[i].x * b.y) + (a[i].y * b.x);
1015 a[i].x = c.x;
1016 a[i].y = c.y;
1017 }
1018 }
1019
1020 /*
1021 * Vector in-place multiplication, a[i] *= b (mixed).
1022 *
1023 * \param a complex array (LHS)
1024 * \param b real scalar (RHS)
1025 * \param n size of arrays
1026 */
1027 __global__
1028 void _mulEqS(cudaComplex* a, const cudaReal b, const int n)
1029 {
1030 int nThreads = blockDim.x * gridDim.x;
1031 int startID = blockIdx.x * blockDim.x + threadIdx.x;
1032 for (int i = startID; i < n; i += nThreads) {
1033 a[i].x *= b;
1034 a[i].y *= b;
1035 }
1036 }
1037
1038 /*
1039 * Vector in-place division, a[i] /= b[i] (real).
1040 *
1041 * \param a real array (LHS)
1042 * \param b real array (RHS)
1043 * \param n size of arrays
1044 */
1045 __global__
1046 void _divEqV(cudaReal* a, cudaReal const * b, const int n)
1047 {
1048 int nThreads = blockDim.x * gridDim.x;
1049 int startID = blockIdx.x * blockDim.x + threadIdx.x;
1050 for (int i = startID; i < n; i += nThreads) {
1051 a[i] /= b[i];
1052 }
1053 }
1054
1055 /*
1056 * Vector in-place division, a[i] /= b[i] (mixed, b = real).
1057 *
1058 * \param a complex array (LHS)
1059 * \param b real array (RHS)
1060 * \param n size of arrays
1061 */
1062 __global__
1063 void _divEqV(cudaComplex* a, cudaReal const * b, const int n)
1064 {
1065 int nThreads = blockDim.x * gridDim.x;
1066 int startID = blockIdx.x * blockDim.x + threadIdx.x;
1067 for (int i = startID; i < n; i += nThreads) {
1068 a[i].x /= b[i];
1069 a[i].y /= b[i];
1070 }
1071 }
1072
1073 /*
1074 * Vector in-place division, a[i] /= b (real).
1075 *
1076 * \param a real array (LHS)
1077 * \param b real scalar (RHS)
1078 * \param n size of arrays
1079 */
1080 __global__
1081 void _divEqS(cudaReal* a, const cudaReal b, const int n)
1082 {
1083 int nThreads = blockDim.x * gridDim.x;
1084 int startID = blockIdx.x * blockDim.x + threadIdx.x;
1085 for (int i = startID; i < n; i += nThreads) {
1086 a[i] /= b;
1087 }
1088 }
1089
1090 /*
1091 * Vector in-place division, a[i] /= b (mixed, b = real).
1092 *
1093 * \param a complex array (LHS)
1094 * \param b real scalar (RHS)
1095 * \param n size of arrays
1096 */
1097 __global__
1098 void _divEqS(cudaComplex* a, const cudaReal b, const int n)
1099 {
1100 int nThreads = blockDim.x * gridDim.x;
1101 int startID = blockIdx.x * blockDim.x + threadIdx.x;
1102 for (int i = startID; i < n; i += nThreads) {
1103 a[i].x /= b;
1104 a[i].y /= b;
1105 }
1106 }
1107
1108 // Exponentiation
1109
1110 /*
1111 * Vector exponentiation, a[i] = exp(b[i]) (real).
1112 *
1113 * \param a real array (LHS)
1114 * \param b real array (RHS)
1115 * \param n size of arrays
1116 */
1117 __global__
1118 void _expV(cudaReal* a, cudaReal const * b, const int n)
1119 {
1120 int nThreads = blockDim.x * gridDim.x;
1121 int startID = blockIdx.x * blockDim.x + threadIdx.x;
1122 for (int i = startID; i < n; i += nThreads) {
1123 a[i] = exp(b[i]);
1124 }
1125 }
1126
1127 /*
1128 * Vector exponentiation, a[i] = exp(b[i]) (complex).
1129 *
1130 * \param a complex array (LHS)
1131 * \param b complex array (RHS)
1132 * \param n size of arrays
1133 */
1134 __global__
1135 void _expV(cudaComplex* a, cudaComplex const * b, const int n)
1136 {
1137 int nThreads = blockDim.x * gridDim.x;
1138 int startID = blockIdx.x * blockDim.x + threadIdx.x;
1139 for (int i = startID; i < n; i += nThreads) {
1140 a[i].x = exp(b[i].x) * cos(b[i].y);
1141 a[i].y = exp(b[i].x) * sin(b[i].y);
1142 }
1143 }
1144
1145 // Square
1146
1147 /*
1148 * Square of real vector, a[i] = (b[i])^2 (real).
1149 *
1150 * \param a real array (LHS)
1151 * \param b real array (RHS)
1152 * \param n size of arrays
1153 */
1154 __global__
1155 void _sqV(cudaReal* a, cudaReal const * b, const int n)
1156 {
1157 int nThreads = blockDim.x * gridDim.x;
1158 int startID = blockIdx.x * blockDim.x + threadIdx.x;
1159 for (int i = startID; i < n; i += nThreads) {
1160 a[i] = b[i]*b[i];
1161 }
1162 }
1163
1164 /*
1165 * Square of complex vector, a[i] = (b[i])^2 (complex).
1166 *
1167 * \param a complex array (LHS)
1168 * \param b complex array (RHS)
1169 * \param n size of arrays
1170 */
1171 __global__
1172 void _sqV(cudaComplex* a, cudaComplex const * b, const int n)
1173 {
1174 int nThreads = blockDim.x * gridDim.x;
1175 int startID = blockIdx.x * blockDim.x + threadIdx.x;
1176 cudaReal bx, by;
1177 for (int i = startID; i < n; i += nThreads) {
1178 bx = b[i].x;
1179 by = b[i].y;
1180 a[i].x = (bx*bx) - (by*by);
1181 a[i].y = 2.0 * bx * by;
1182 }
1183 }
1184
1185 // Absolute magnitude
1186
1187 /*
1188 * Vector absolute magnitude, a[i] = abs(b[i]) (real).
1189 *
1190 * \param a real array (LHS)
1191 * \param b real array (RHS)
1192 * \param n size of arrays
1193 */
1194 __global__
1195 void _absV(cudaReal* a, cudaReal const * b, const int n)
1196 {
1197 int nThreads = blockDim.x * gridDim.x;
1198 int startID = blockIdx.x * blockDim.x + threadIdx.x;
1199 for (int i = startID; i < n; i += nThreads) {
1200 a[i] = std::fabs(b[i]);
1201 }
1202 }
1203
1204 /*
1205 * Vector absolute magnitude, a[i] = |b[i]|^2 (complex).
1206 *
1207 * \param a real array (LHS)
1208 * \param b complex array (RHS)
1209 * \param n size of arrays
1210 */
1211 __global__
1212 void _sqAbsV(cudaReal* a, cudaComplex const * b, const int n)
1213 {
1214 int nThreads = blockDim.x * gridDim.x;
1215 int startID = blockIdx.x * blockDim.x + threadIdx.x;
1216 cudaReal bx, by;
1217 for (int i = startID; i < n; i += nThreads) {
1218 bx = b[i].x;
1219 by = b[i].y;
1220 a[i] = bx*bx + by*by;
1221 }
1222 }
1223
1224 } // end anonymous namespace
1225
1226 // CUDA kernel wrappers:
1227
1228 /*
1229 * Vector assignment, a[i] = b[i] (real).
1230 */
1232 DeviceArray<cudaReal> const & b,
1233 const int beginIdA, const int beginIdB, const int n)
1234 {
1235 UTIL_CHECK(a.capacity() >= n + beginIdA);
1236 UTIL_CHECK(b.capacity() >= n + beginIdB);
1237
1238 // GPU resources
1239 int nBlocks, nThreads;
1240 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1241
1242 #if 0
1243 // Launch kernel
1244 _eqV<<<nBlocks, nThreads>>>(a.cArray()+beginIdA,
1245 b.cArray()+beginIdB, n);
1246 cudaErrorCheck( cudaGetLastError() );
1247 #endif
1248
1249 // Copy all elements
1250 cudaErrorCheck( cudaMemcpy(a.cArray() + beginIdA,
1251 b.cArray() + beginIdB,
1252 n * sizeof(cudaReal),
1253 cudaMemcpyDeviceToDevice) );
1254
1255 }
1256
1257 /*
1258 * Vector assignment, a[i] = b[i] (real, device to host).
1259 */
1261 DeviceArray<cudaReal> const & b,
1262 const int beginIdA, const int beginIdB, const int n)
1263 {
1264 UTIL_CHECK(a.capacity() >= n + beginIdA);
1265 UTIL_CHECK(b.capacity() >= n + beginIdB);
1266
1267 // GPU resources
1268 int nBlocks, nThreads;
1269 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1270
1271 // Copy all elements
1272 cudaErrorCheck( cudaMemcpy(a.cArray() + beginIdA,
1273 b.cArray() + beginIdB,
1274 n * sizeof(cudaReal),
1275 cudaMemcpyDeviceToHost) );
1276
1277 }
1278
1279 /*
1280 * Vector assignment, a[i] = b[i] (real, device to host).
1281 */
1283 Array<cudaReal> const & b,
1284 const int beginIdA, const int beginIdB, const int n)
1285 {
1286 UTIL_CHECK(a.capacity() >= n + beginIdA);
1287 UTIL_CHECK(b.capacity() >= n + beginIdB);
1288
1289 // GPU resources
1290 int nBlocks, nThreads;
1291 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1292
1293 // Copy all elements
1294 cudaErrorCheck( cudaMemcpy(a.cArray() + beginIdA,
1295 b.cArray() + beginIdB,
1296 n * sizeof(cudaReal),
1297 cudaMemcpyHostToDevice) );
1298
1299 }
1300
1301 /*
1302 * Vector assignment, a[i] = b[i] (complex).
1303 */
1305 DeviceArray<cudaComplex> const & b,
1306 const int beginIdA, const int beginIdB, const int n)
1307 {
1308 UTIL_CHECK(a.capacity() >= n + beginIdA);
1309 UTIL_CHECK(b.capacity() >= n + beginIdB);
1310
1311 // GPU resources
1312 int nBlocks, nThreads;
1313 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1314
1315 // Launch kernel
1316 _eqV<<<nBlocks, nThreads>>>(a.cArray()+beginIdA,
1317 b.cArray()+beginIdB, n);
1318 cudaErrorCheck( cudaGetLastError() );
1319 }
1320
1321 /*
1322 * Vector assignment, a[i] = (b[i], c[i]) (complex, real & imaginary).
1323 */
1325 DeviceArray<cudaReal> const & b, // real part
1326 DeviceArray<cudaReal> const & c, // imaginary part
1327 const int beginIdA, const int beginIdB, const int beginIdC,
1328 const int n)
1329 {
1330 UTIL_CHECK(a.capacity() >= n + beginIdA);
1331 UTIL_CHECK(b.capacity() >= n + beginIdB);
1332 UTIL_CHECK(c.capacity() >= n + beginIdC);
1333
1334 // GPU resources
1335 int nBlocks, nThreads;
1336 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1337
1338 // Launch kernel
1339 _eqV<<<nBlocks, nThreads>>>(a.cArray()+beginIdA,
1340 b.cArray()+beginIdB,
1341 c.cArray()+beginIdC, n);
1342 cudaErrorCheck( cudaGetLastError() );
1343 }
1344
1345 /*
1346 * Vector-scalar assignment, a[i] = b (real).
1347 */
1349 const cudaReal b,
1350 const int beginIdA, const int n)
1351 {
1352 UTIL_CHECK(a.capacity() >= n + beginIdA);
1353
1354 // GPU resources
1355 int nBlocks, nThreads;
1356 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1357
1358 // Launch kernel
1359 _eqS<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b, n);
1360 cudaErrorCheck( cudaGetLastError() );
1361 }
1362
1363 /*
1364 * Vector-scalar assignment, a[i] = b (complex).
1365 */
1367 const cudaComplex b,
1368 const int beginIdA, const int n)
1369 {
1370 UTIL_CHECK(a.capacity() >= n + beginIdA);
1371
1372 // GPU resources
1373 int nBlocks, nThreads;
1374 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1375
1376 // Launch kernel
1377 _eqS<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b, n);
1378 cudaErrorCheck( cudaGetLastError() );
1379 }
1380
1381 // Binary arithmetic operations, separate array for result
1382
1383 /*
1384 * Vector addition, a[i] = b[i] + c[i] (real).
1385 */
1387 DeviceArray<cudaReal> const & b,
1388 DeviceArray<cudaReal> const & c,
1389 const int beginIdA, const int beginIdB, const int beginIdC,
1390 const int n)
1391 {
1392 UTIL_CHECK(a.capacity() >= n + beginIdA);
1393 UTIL_CHECK(b.capacity() >= n + beginIdB);
1394 UTIL_CHECK(c.capacity() >= n + beginIdC);
1395
1396 // GPU resources
1397 int nBlocks, nThreads;
1398 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1399
1400 // Launch kernel
1401 _addVV<<<nBlocks, nThreads>>>(a.cArray()+beginIdA,
1402 b.cArray()+beginIdB,
1403 c.cArray()+beginIdC, n);
1404 cudaErrorCheck( cudaGetLastError() );
1405 }
1406
1407 /*
1408 * Vector addition, a[i] = b[i] + c[i] (complex).
1409 */
1411 DeviceArray<cudaComplex> const & b,
1412 DeviceArray<cudaComplex> const & c,
1413 const int beginIdA, const int beginIdB, const int beginIdC,
1414 const int n)
1415 {
1416 UTIL_CHECK(a.capacity() >= n + beginIdA);
1417 UTIL_CHECK(b.capacity() >= n + beginIdB);
1418 UTIL_CHECK(c.capacity() >= n + beginIdC);
1419
1420 // GPU resources
1421 int nBlocks, nThreads;
1422 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1423
1424 // Launch kernel
1425 _addVV<<<nBlocks, nThreads>>>(a.cArray()+beginIdA,
1426 b.cArray()+beginIdB,
1427 c.cArray()+beginIdC, n);
1428 cudaErrorCheck( cudaGetLastError() );
1429 }
1430
1431 /*
1432 * Vector addition, a[i] = b[i] + c[i] (mixed, b real).
1433 */
1435 DeviceArray<cudaReal> const & b,
1436 DeviceArray<cudaComplex> const & c,
1437 const int beginIdA, const int beginIdB, const int beginIdC,
1438 const int n)
1439 {
1440 UTIL_CHECK(a.capacity() >= n + beginIdA);
1441 UTIL_CHECK(b.capacity() >= n + beginIdB);
1442 UTIL_CHECK(c.capacity() >= n + beginIdC);
1443
1444 // GPU resources
1445 int nBlocks, nThreads;
1446 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1447
1448 // Launch kernel
1449 _addVV<<<nBlocks, nThreads>>>(a.cArray() + beginIdA,
1450 b.cArray() + beginIdB,
1451 c.cArray() + beginIdC, n);
1452 cudaErrorCheck( cudaGetLastError() );
1453 }
1454
1455 /*
1456 * Vector addition, a[i] = b[i] + c[i] (mixed, c real).
1457 */
1459 DeviceArray<cudaComplex> const & b,
1460 DeviceArray<cudaReal> const & c,
1461 const int beginIdA, const int beginIdB, const int beginIdC,
1462 const int n)
1463 {
1464 UTIL_CHECK(a.capacity() >= n + beginIdA);
1465 UTIL_CHECK(b.capacity() >= n + beginIdB);
1466 UTIL_CHECK(c.capacity() >= n + beginIdC);
1467
1468 // GPU resources
1469 int nBlocks, nThreads;
1470 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1471
1472 // Launch kernel
1473 _addVV<<<nBlocks, nThreads>>>(a.cArray() + beginIdA,
1474 b.cArray() + beginIdB,
1475 c.cArray() + beginIdC, n);
1476 cudaErrorCheck( cudaGetLastError() );
1477 }
1478
1479 /*
1480 * Vector-scalar addition, a[i] = b[i] + c (real).
1481 */
1483 DeviceArray<cudaReal> const & b,
1484 const cudaReal c,
1485 const int beginIdA, const int beginIdB,int n)
1486 {
1487 UTIL_CHECK(a.capacity() >= n + beginIdA);
1488 UTIL_CHECK(b.capacity() >= n + beginIdB);
1489
1490 // GPU resources
1491 int nBlocks, nThreads;
1492 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1493
1494 // Launch kernel
1495 _addVS<<<nBlocks, nThreads>>>(a.cArray()+beginIdA,
1496 b.cArray()+beginIdB,
1497 c, n);
1498 cudaErrorCheck( cudaGetLastError() );
1499 }
1500
1501 // Vector-scalar addition, a[i] = b[i] + c (complex).
1503 DeviceArray<cudaComplex> const & b,
1504 const cudaComplex c,
1505 const int beginIdA, const int beginIdB,int n)
1506 {
1507 UTIL_CHECK(a.capacity() >= n + beginIdA);
1508 UTIL_CHECK(b.capacity() >= n + beginIdB);
1509
1510 // GPU resources
1511 int nBlocks, nThreads;
1512 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1513
1514 // Launch kernel
1515 _addVS<<<nBlocks, nThreads>>>(a.cArray()+beginIdA,
1516 b.cArray()+beginIdB,
1517 c, n);
1518 cudaErrorCheck( cudaGetLastError() );
1519 }
1520
1521 // Vector addition, a[i] = b[i] + c (mixed, b real).
1523 DeviceArray<cudaReal> const & b,
1524 const cudaComplex c,
1525 const int beginIdA, const int beginIdB,int n)
1526 {
1527 UTIL_CHECK(a.capacity() >= n + beginIdA);
1528 UTIL_CHECK(b.capacity() >= n + beginIdB);
1529
1530 // GPU resources
1531 int nBlocks, nThreads;
1532 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1533
1534 // Launch kernel
1535 _addVS<<<nBlocks, nThreads>>>(a.cArray()+beginIdA,
1536 b.cArray()+beginIdB,
1537 c, n);
1538 cudaErrorCheck( cudaGetLastError() );
1539 }
1540
1541 // Vector addition, a[i] = b[i] + c (mixed, c real).
1543 DeviceArray<cudaComplex> const & b,
1544 const cudaReal c,
1545 const int beginIdA, const int beginIdB,int n)
1546 {
1547 UTIL_CHECK(a.capacity() >= n + beginIdA);
1548 UTIL_CHECK(b.capacity() >= n + beginIdB);
1549
1550 // GPU resources
1551 int nBlocks, nThreads;
1552 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1553
1554 // Launch kernel
1555 _addVS<<<nBlocks, nThreads>>>(a.cArray()+beginIdA,
1556 b.cArray()+beginIdB,
1557 c, n);
1558 cudaErrorCheck( cudaGetLastError() );
1559 }
1560
1561 // Vector subtraction, a[i] = b[i] - c[i] (real).
1563 DeviceArray<cudaReal> const & b,
1564 DeviceArray<cudaReal> const & c,
1565 const int beginIdA,
1566 const int beginIdB, const int beginIdC,
1567 const int n)
1568 {
1569 UTIL_CHECK(a.capacity() >= n + beginIdA);
1570 UTIL_CHECK(b.capacity() >= n + beginIdB);
1571 UTIL_CHECK(c.capacity() >= n + beginIdC);
1572
1573 // GPU resources
1574 int nBlocks, nThreads;
1575 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1576
1577 // Launch kernel
1578 _subVV<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b.cArray()+beginIdB,
1579 c.cArray()+beginIdC, n);
1580 cudaErrorCheck( cudaGetLastError() );
1581 }
1582
1583 // Vector subtraction, a[i] = b[i] - c[i] (cudaComplex).
1585 DeviceArray<cudaComplex> const & b,
1586 DeviceArray<cudaComplex> const & c,
1587 const int beginIdA, const int beginIdB, const int beginIdC,
1588 const int n)
1589 {
1590 UTIL_CHECK(a.capacity() >= n + beginIdA);
1591 UTIL_CHECK(b.capacity() >= n + beginIdB);
1592 UTIL_CHECK(c.capacity() >= n + beginIdC);
1593
1594 // GPU resources
1595 int nBlocks, nThreads;
1596 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1597
1598 // Launch kernel
1599 _subVV<<<nBlocks, nThreads>>>(a.cArray()+beginIdA,
1600 b.cArray()+beginIdB,
1601 c.cArray()+beginIdC, n);
1602 cudaErrorCheck( cudaGetLastError() );
1603 }
1604
1605 // Vector subtraction, a[i]=b[i]-c[i] (mixed).
1607 DeviceArray<cudaReal> const & b,
1608 DeviceArray<cudaComplex> const & c,
1609 const int beginIdA, const int beginIdB, const int beginIdC,
1610 const int n)
1611 {
1612 UTIL_CHECK(a.capacity() >= n + beginIdA);
1613 UTIL_CHECK(b.capacity() >= n + beginIdB);
1614 UTIL_CHECK(c.capacity() >= n + beginIdC);
1615
1616 // GPU resources
1617 int nBlocks, nThreads;
1618 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1619
1620 // Launch kernel
1621 _subVV<<<nBlocks, nThreads>>>(a.cArray()+beginIdA,
1622 b.cArray()+beginIdB,
1623 c.cArray()+beginIdC, n);
1624 cudaErrorCheck( cudaGetLastError() );
1625 }
1626
1627 // Vector subtraction, a[i]=b[i]-c[i] (mixed).
1629 DeviceArray<cudaComplex> const & b,
1630 DeviceArray<cudaReal> const & c,
1631 const int beginIdA, const int beginIdB, const int beginIdC,
1632 const int n)
1633 {
1634 UTIL_CHECK(a.capacity() >= n + beginIdA);
1635 UTIL_CHECK(b.capacity() >= n + beginIdB);
1636 UTIL_CHECK(c.capacity() >= n + beginIdC);
1637
1638 // GPU resources
1639 int nBlocks, nThreads;
1640 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1641
1642 // Launch kernel
1643 _subVV<<<nBlocks, nThreads>>>(a.cArray()+beginIdA,
1644 b.cArray()+beginIdB,
1645 c.cArray()+beginIdC, n);
1646 cudaErrorCheck( cudaGetLastError() );
1647 }
1648
1649 // Vector subtraction, a[i] = b[i] - c (cudaReal).
1651 DeviceArray<cudaReal> const & b,
1652 const cudaReal c,
1653 const int beginIdA, const int beginIdB,
1654 const int n)
1655 {
1656 UTIL_CHECK(a.capacity() >= n + beginIdA);
1657 UTIL_CHECK(b.capacity() >= n + beginIdB);
1658
1659 // GPU resources
1660 int nBlocks, nThreads;
1661 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1662
1663 // Launch kernel
1664 _subVS<<<nBlocks, nThreads>>>(a.cArray()+beginIdA,
1665 b.cArray()+beginIdB,
1666 c, n);
1667 cudaErrorCheck( cudaGetLastError() );
1668 }
1669
1670 // Vector subtraction, a[i] = b[i] - c (complex).
1672 DeviceArray<cudaComplex> const & b,
1673 const cudaComplex c, const int beginIdA, const int beginIdB,
1674 const int n)
1675 {
1676 UTIL_CHECK(a.capacity() >= n + beginIdA);
1677 UTIL_CHECK(b.capacity() >= n + beginIdB);
1678
1679 // GPU resources
1680 int nBlocks, nThreads;
1681 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1682
1683 // Launch kernel
1684 _subVS<<<nBlocks, nThreads>>>(a.cArray()+beginIdA,
1685 b.cArray()+beginIdB,
1686 c, n);
1687 cudaErrorCheck( cudaGetLastError() );
1688 }
1689
1690 // Vector-scalar subtraction, a[i] = b[i] - c (mixed).
1692 DeviceArray<cudaReal> const & b,
1693 const cudaComplex c, const int beginIdA, const int beginIdB,
1694 const int n)
1695 {
1696 UTIL_CHECK(a.capacity() >= n + beginIdA);
1697 UTIL_CHECK(b.capacity() >= n + beginIdB);
1698
1699 // GPU resources
1700 int nBlocks, nThreads;
1701 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1702
1703 // Launch kernel
1704 _subVS<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b.cArray()+beginIdB,
1705 c, n);
1706 cudaErrorCheck( cudaGetLastError() );
1707 }
1708
1709 // Vector-scalar subtraction, a[i] = b[i] - c (mixed).
1711 DeviceArray<cudaComplex> const & b,
1712 const cudaReal c,
1713 const int beginIdA, const int beginIdB,
1714 const int n)
1715 {
1716 UTIL_CHECK(a.capacity() >= n + beginIdA);
1717 UTIL_CHECK(b.capacity() >= n + beginIdB);
1718
1719 // GPU resources
1720 int nBlocks, nThreads;
1721 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1722
1723 // Launch kernel
1724 _subVS<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b.cArray()+beginIdB,
1725 c, n);
1726 cudaErrorCheck( cudaGetLastError() );
1727 }
1728
1729 // Vector multiplication, a[i] = b[i] * c[i] (real).
1731 DeviceArray<cudaReal> const & b,
1732 DeviceArray<cudaReal> const & c,
1733 const int beginIdA, const int beginIdB, const int beginIdC,
1734 const int n)
1735 {
1736 UTIL_CHECK(a.capacity() >= n + beginIdA);
1737 UTIL_CHECK(b.capacity() >= n + beginIdB);
1738 UTIL_CHECK(c.capacity() >= n + beginIdC);
1739
1740 // GPU resources
1741 int nBlocks, nThreads;
1742 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1743
1744 // Launch kernel
1745 _mulVV<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b.cArray()+beginIdB,
1746 c.cArray()+beginIdC, n);
1747 cudaErrorCheck( cudaGetLastError() );
1748 }
1749
1750 // Vector multiplication, a[i] = b[i] * c[i] (cudaComplex).
1752 DeviceArray<cudaComplex> const & b,
1753 DeviceArray<cudaComplex> const & c,
1754 const int beginIdA, const int beginIdB, const int beginIdC,
1755 const int n)
1756 {
1757 UTIL_CHECK(a.capacity() >= n + beginIdA);
1758 UTIL_CHECK(b.capacity() >= n + beginIdB);
1759 UTIL_CHECK(c.capacity() >= n + beginIdC);
1760
1761 // GPU resources
1762 int nBlocks, nThreads;
1763 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1764
1765 // Launch kernel
1766 _mulVV<<<nBlocks, nThreads>>>(a.cArray()+beginIdA,
1767 b.cArray()+beginIdB,
1768 c.cArray()+beginIdC, n);
1769 cudaErrorCheck( cudaGetLastError() );
1770 }
1771
1772 // Vector multiplication, a[i]=b[i]*c[i] (mixed).
1774 DeviceArray<cudaReal> const & b,
1775 DeviceArray<cudaComplex> const & c,
1776 const int beginIdA, const int beginIdB, const int beginIdC,
1777 const int n)
1778 {
1779 UTIL_CHECK(a.capacity() >= n + beginIdA);
1780 UTIL_CHECK(b.capacity() >= n + beginIdB);
1781 UTIL_CHECK(c.capacity() >= n + beginIdC);
1782
1783 // GPU resources
1784 int nBlocks, nThreads;
1785 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1786
1787 // Launch kernel
1788 _mulVV<<<nBlocks, nThreads>>>(a.cArray()+beginIdA,
1789 b.cArray()+beginIdB,
1790 c.cArray()+beginIdC, n);
1791 cudaErrorCheck( cudaGetLastError() );
1792 }
1793
1794 // Vector multiplication, a[i]=b[i]*c[i] (mixed).
1796 DeviceArray<cudaComplex> const & b,
1797 DeviceArray<cudaReal> const & c,
1798 const int beginIdA, const int beginIdB, const int beginIdC,
1799 const int n)
1800 {
1801 UTIL_CHECK(a.capacity() >= n + beginIdA);
1802 UTIL_CHECK(b.capacity() >= n + beginIdB);
1803 UTIL_CHECK(c.capacity() >= n + beginIdC);
1804
1805 // GPU resources
1806 int nBlocks, nThreads;
1807 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1808
1809 // Launch kernel
1810 _mulVV<<<nBlocks, nThreads>>>(a.cArray()+beginIdA,
1811 b.cArray()+beginIdB,
1812 c.cArray()+beginIdC, n);
1813 cudaErrorCheck( cudaGetLastError() );
1814 }
1815
1816 // Vector-scalar multiplication, a[i] = b[i] * c (real).
1818 DeviceArray<cudaReal> const & b,
1819 const cudaReal c,
1820 const int beginIdA, const int beginIdB,
1821 const int n)
1822 {
1823 UTIL_CHECK(a.capacity() >= n + beginIdA);
1824 UTIL_CHECK(b.capacity() >= n + beginIdB);
1825
1826 // GPU resources
1827 int nBlocks, nThreads;
1828 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1829
1830 // Launch kernel
1831 _mulVS<<<nBlocks, nThreads>>>(a.cArray()+beginIdA,
1832 b.cArray()+beginIdB,
1833 c, n);
1834 cudaErrorCheck( cudaGetLastError() );
1835 }
1836
1837 // Vector-scalar multiplication, a[i] = b[i] * c (complex).
1839 DeviceArray<cudaComplex> const & b,
1840 const cudaComplex c,
1841 const int beginIdA, const int beginIdB,
1842 const int n)
1843 {
1844 UTIL_CHECK(a.capacity() >= n + beginIdA);
1845 UTIL_CHECK(b.capacity() >= n + beginIdB);
1846
1847 // GPU resources
1848 int nBlocks, nThreads;
1849 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1850
1851 // Launch kernel
1852 _mulVS<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b.cArray()+beginIdB,
1853 c, n);
1854 cudaErrorCheck( cudaGetLastError() );
1855 }
1856
1857 // Vector-scalar multiplication, a[i] = b[i] * c (mixed).
1859 DeviceArray<cudaReal> const & b,
1860 const cudaComplex c,
1861 const int beginIdA, const int beginIdB, const int n)
1862 {
1863 UTIL_CHECK(a.capacity() >= n + beginIdA);
1864 UTIL_CHECK(b.capacity() >= n + beginIdB);
1865
1866 // GPU resources
1867 int nBlocks, nThreads;
1868 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1869
1870 // Launch kernel
1871 _mulVS<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b.cArray()+beginIdB,
1872 c, n);
1873 cudaErrorCheck( cudaGetLastError() );
1874 }
1875
1876 // Vector multiplication, a[i] = b[i] * c (mixed).
1878 DeviceArray<cudaComplex> const & b,
1879 const cudaReal c,
1880 const int beginIdA, const int beginIdB,
1881 const int n)
1882 {
1883 UTIL_CHECK(a.capacity() >= n + beginIdA);
1884 UTIL_CHECK(b.capacity() >= n + beginIdB);
1885
1886 // GPU resources
1887 int nBlocks, nThreads;
1888 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1889
1890 // Launch kernel
1891 _mulVS<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b.cArray()+beginIdB,
1892 c, n);
1893 cudaErrorCheck( cudaGetLastError() );
1894 }
1895
1896 // Vector division, a[i] = b[i] / c[i] (real).
1898 DeviceArray<cudaReal> const & b,
1899 DeviceArray<cudaReal> const & c, const int beginIdA,
1900 const int beginIdB, const int beginIdC, const int n)
1901 {
1902 UTIL_CHECK(a.capacity() >= n + beginIdA);
1903 UTIL_CHECK(b.capacity() >= n + beginIdB);
1904 UTIL_CHECK(c.capacity() >= n + beginIdC);
1905
1906 // GPU resources
1907 int nBlocks, nThreads;
1908 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1909
1910 // Launch kernel
1911 _divVV<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b.cArray()+beginIdB,
1912 c.cArray()+beginIdC, n);
1913 cudaErrorCheck( cudaGetLastError() );
1914 }
1915
1916 /*
1917 * Vector division, a[i] = b[i] / c[i] (mixed).
1918 */
1920 DeviceArray<cudaComplex> const & b,
1921 DeviceArray<cudaReal> const & c,
1922 const int beginIdA, const int beginIdB, const int beginIdC,
1923 const int n)
1924 {
1925 UTIL_CHECK(a.capacity() >= n + beginIdA);
1926 UTIL_CHECK(b.capacity() >= n + beginIdB);
1927 UTIL_CHECK(c.capacity() >= n + beginIdC);
1928
1929 // GPU resources
1930 int nBlocks, nThreads;
1931 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1932
1933 // Launch kernel
1934 _divVV<<<nBlocks, nThreads>>>(a.cArray() + beginIdA,
1935 b.cArray() + beginIdB,
1936 c.cArray() + beginIdC, n);
1937 cudaErrorCheck( cudaGetLastError() );
1938 }
1939
1940 /*
1941 * Vector-scalar division, a[i] = b[i] / c (real).
1942 */
1944 DeviceArray<cudaReal> const & b,
1945 const cudaReal c,
1946 const int beginIdA, const int beginIdB,
1947 const int n)
1948 {
1949 UTIL_CHECK(a.capacity() >= n + beginIdA);
1950 UTIL_CHECK(b.capacity() >= n + beginIdB);
1951
1952 // GPU resources
1953 int nBlocks, nThreads;
1954 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1955
1956 // Launch kernel
1957 _divVS<<<nBlocks, nThreads>>>(a.cArray() + beginIdA,
1958 b.cArray() + beginIdB,
1959 c, n);
1960 cudaErrorCheck( cudaGetLastError() );
1961 }
1962
1963 /*
1964 * Vector division, a[i] = b[i] / c (mixed, c real).
1965 */
1967 DeviceArray<cudaComplex> const & b,
1968 const cudaReal c,
1969 const int beginIdA, const int beginIdB,
1970 const int n)
1971 {
1972 UTIL_CHECK(a.capacity() >= n + beginIdA);
1973 UTIL_CHECK(b.capacity() >= n + beginIdB);
1974
1975 // GPU resources
1976 int nBlocks, nThreads;
1977 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1978
1979 // Launch kernel
1980 _divVS<<<nBlocks, nThreads>>>(a.cArray()+beginIdA,
1981 b.cArray()+beginIdB,
1982 c, n);
1983 cudaErrorCheck( cudaGetLastError() );
1984 }
1985
1986 /*
1987 * Division of scalar by vector, a[i] = b / c[i] (real).
1988 */
1990 const cudaReal b,
1991 DeviceArray<cudaReal> const & c,
1992 const int beginIdA, const int beginIdC, const int n)
1993 {
1994 UTIL_CHECK(a.capacity() >= n + beginIdA);
1995 UTIL_CHECK(c.capacity() >= n + beginIdC);
1996
1997 // GPU resources
1998 int nBlocks, nThreads;
1999 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
2000
2001 // Launch kernel
2002 _divSV<<<nBlocks, nThreads>>>(a.cArray() + beginIdA, b,
2003 c.cArray() + beginIdC, n);
2004 cudaErrorCheck( cudaGetLastError() );
2005 }
2006
2007 // In-place arithmetic operations
2008
2009 /*
2010 * Vector addition in-place, a[i] += b[i] (real).
2011 */
2013 DeviceArray<cudaReal> const & b,
2014 const int beginIdA, const int beginIdB, const int n)
2015 {
2016 UTIL_CHECK(a.capacity() >= n + beginIdA);
2017 UTIL_CHECK(b.capacity() >= n + beginIdB);
2018
2019 // GPU resources
2020 int nBlocks, nThreads;
2021 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
2022
2023 // Launch kernel
2024 _addEqV<<<nBlocks, nThreads>>>(a.cArray()+beginIdA,
2025 b.cArray()+beginIdB, n);
2026 cudaErrorCheck( cudaGetLastError() );
2027 }
2028
2029 /*
2030 * Vector addition in-place, a[i] += b[i] (complex).
2031 */
2033 DeviceArray<cudaComplex> const & b,
2034 const int beginIdA, const int beginIdB, const int n)
2035 {
2036 UTIL_CHECK(a.capacity() >= n + beginIdA);
2037 UTIL_CHECK(b.capacity() >= n + beginIdB);
2038
2039 // GPU resources
2040 int nBlocks, nThreads;
2041 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
2042
2043 // Launch kernel
2044 _addEqV<<<nBlocks, nThreads>>>(a.cArray()+beginIdA,
2045 b.cArray()+beginIdB, n);
2046 cudaErrorCheck( cudaGetLastError() );
2047 }
2048
2049 /*
2050 * Vector addition in-place, a[i] += (b[i], c[i]) (complex, real/imag).
2051 */
2053 DeviceArray<cudaReal> const & b,
2054 DeviceArray<cudaReal> const & c,
2055 const int beginIdA, const int beginIdB, const int beginIdC,
2056 const int n)
2057 {
2058 UTIL_CHECK(a.capacity() >= n + beginIdA);
2059 UTIL_CHECK(b.capacity() >= n + beginIdB);
2060 UTIL_CHECK(c.capacity() >= n + beginIdC);
2061
2062 // GPU resources
2063 int nBlocks, nThreads;
2064 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
2065
2066 // Launch kernel
2067 _addEqV<<<nBlocks, nThreads>>>(a.cArray() + beginIdA,
2068 b.cArray() + beginIdB,
2069 c.cArray() + beginIdB,
2070 n);
2071 cudaErrorCheck( cudaGetLastError() );
2072 }
2073
2074 /*
2075 * Vector addition in-place, a[i] += b[i] (mixed).
2076 */
2078 DeviceArray<cudaReal> const & b,
2079 const int beginIdA, const int beginIdB,
2080 const int n)
2081 {
2082 UTIL_CHECK(a.capacity() >= n + beginIdA);
2083 UTIL_CHECK(b.capacity() >= n + beginIdB);
2084
2085 // GPU resources
2086 int nBlocks, nThreads;
2087 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
2088
2089 // Launch kernel
2090 _addEqV<<<nBlocks, nThreads>>>(a.cArray()+beginIdA,
2091 b.cArray()+beginIdB, n);
2092 cudaErrorCheck( cudaGetLastError() );
2093 }
2094
2095 /*
2096 * Vector-scalar in-place addition, a[i] += b (real).
2097 */
2099 const cudaReal b,
2100 const int beginIdA, const int n)
2101 {
2102 UTIL_CHECK(a.capacity() >= n + beginIdA);
2103
2104 // GPU resources
2105 int nBlocks, nThreads;
2106 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
2107
2108 // Launch kernel
2109 _addEqS<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b, n);
2110 cudaErrorCheck( cudaGetLastError() );
2111 }
2112
2113 /*
2114 * Vector-scalar in-place addition, a[i] += b (complex).
2115 */
2117 const cudaComplex b,
2118 const int beginIdA, const int n)
2119 {
2120 UTIL_CHECK(a.capacity() >= n + beginIdA);
2121
2122 // GPU resources
2123 int nBlocks, nThreads;
2124 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
2125
2126 // Launch kernel
2127 _addEqS<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b, n);
2128 cudaErrorCheck( cudaGetLastError() );
2129 }
2130
2131 /*
2132 * Vector-scalar in-place addition, a[i] += b (mixed).
2133 */
2135 const cudaReal b,
2136 const int beginIdA, const int n)
2137 {
2138 UTIL_CHECK(a.capacity() >= n + beginIdA);
2139
2140 // GPU resources
2141 int nBlocks, nThreads;
2142 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
2143
2144 // Launch kernel
2145 _addEqS<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b, n);
2146 cudaErrorCheck( cudaGetLastError() );
2147 }
2148
2149 /*
2150 * Vector in-place subtraction, a[i] -= b[i] (real).
2151 */
2153 DeviceArray<cudaReal> const & b,
2154 const int beginIdA, const int beginIdB, const int n)
2155 {
2156 UTIL_CHECK(a.capacity() >= n + beginIdA);
2157 UTIL_CHECK(b.capacity() >= n + beginIdB);
2158
2159 // GPU resources
2160 int nBlocks, nThreads;
2161 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
2162
2163 // Launch kernel
2164 _subEqV<<<nBlocks, nThreads>>>(a.cArray()+beginIdA,
2165 b.cArray()+beginIdB, n);
2166 cudaErrorCheck( cudaGetLastError() );
2167 }
2168
2169 /*
2170 * Vector in-place subtraction, a[i] -= b[i] (complex).
2171 */
2173 DeviceArray<cudaComplex> const & b,
2174 const int beginIdA, const int beginIdB, const int n)
2175 {
2176 UTIL_CHECK(a.capacity() >= n + beginIdA);
2177 UTIL_CHECK(b.capacity() >= n + beginIdB);
2178
2179 // GPU resources
2180 int nBlocks, nThreads;
2181 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
2182
2183 // Launch kernel
2184 _subEqV<<<nBlocks, nThreads>>>(a.cArray()+beginIdA,
2185 b.cArray()+beginIdB, n);
2186 cudaErrorCheck( cudaGetLastError() );
2187 }
2188
2189 /*
2190 * Vector in-place subtraction, a[i] -= b[i] (mixed).
2191 */
2193 DeviceArray<cudaReal> const & b,
2194 const int beginIdA, const int beginIdB, const int n)
2195 {
2196 UTIL_CHECK(a.capacity() >= n + beginIdA);
2197 UTIL_CHECK(b.capacity() >= n + beginIdB);
2198
2199 // GPU resources
2200 int nBlocks, nThreads;
2201 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
2202
2203 // Launch kernel
2204 _subEqV<<<nBlocks, nThreads>>>(a.cArray()+beginIdA,
2205 b.cArray()+beginIdB, n);
2206 cudaErrorCheck( cudaGetLastError() );
2207 }
2208
2209 /*
2210 * Vector-scalar in-place subtraction, a[i] -= b (real).
2211 */
2213 const cudaReal b,
2214 const int beginIdA, const int n)
2215 {
2216 UTIL_CHECK(a.capacity() >= n + beginIdA);
2217
2218 // GPU resources
2219 int nBlocks, nThreads;
2220 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
2221
2222 // Launch kernel
2223 _subEqS<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b, n);
2224 cudaErrorCheck( cudaGetLastError() );
2225 }
2226
2227 /*
2228 * Vector-scalar in-place subtraction, a[i] -= b (complex).
2229 */
2231 const cudaComplex b,
2232 const int beginIdA, const int n)
2233 {
2234 UTIL_CHECK(a.capacity() >= n + beginIdA);
2235
2236 // GPU resources
2237 int nBlocks, nThreads;
2238 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
2239
2240 // Launch kernel
2241 _subEqS<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b, n);
2242 cudaErrorCheck( cudaGetLastError() );
2243 }
2244
2245 /*
2246 * Vector-scalar in-place subtraction, a[i] -= b (mixed).
2247 */
2249 const cudaReal b,
2250 const int beginIdA, const int n)
2251 {
2252 UTIL_CHECK(a.capacity() >= n + beginIdA);
2253
2254 // GPU resources
2255 int nBlocks, nThreads;
2256 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
2257
2258 // Launch kernel
2259 _subEqS<<<nBlocks, nThreads>>>(a.cArray() + beginIdA, b, n);
2260 cudaErrorCheck( cudaGetLastError() );
2261 }
2262
2263 // In-place multiplication
2264
2265 // Vector in-place multiplication, a[i] *= b[i] (real).
2267 DeviceArray<cudaReal> const & b,
2268 const int beginIdA, const int beginIdB, const int n)
2269 {
2270 UTIL_CHECK(a.capacity() >= n + beginIdA);
2271 UTIL_CHECK(b.capacity() >= n + beginIdB);
2272
2273 // GPU resources
2274 int nBlocks, nThreads;
2275 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
2276
2277 // Launch kernel
2278 _mulEqV<<<nBlocks, nThreads>>>(a.cArray() + beginIdA,
2279 b.cArray() + beginIdB, n);
2280 cudaErrorCheck( cudaGetLastError() );
2281 }
2282
2283 // Vector in-place multiplication, a[i] *= b[i] (cudaComplex).
2285 DeviceArray<cudaComplex> const & b,
2286 const int beginIdA, const int beginIdB, const int n)
2287 {
2288 UTIL_CHECK(a.capacity() >= n + beginIdA);
2289 UTIL_CHECK(b.capacity() >= n + beginIdB);
2290
2291 // GPU resources
2292 int nBlocks, nThreads;
2293 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
2294
2295 // Launch kernel
2296 _mulEqV<<<nBlocks, nThreads>>>(a.cArray()+beginIdA,
2297 b.cArray()+beginIdB, n);
2298 cudaErrorCheck( cudaGetLastError() );
2299 }
2300
2301 // Vector in-place multiplication, a[i] *= b[i] (mixed).
2303 DeviceArray<cudaReal> const & b,
2304 const int beginIdA, const int beginIdB, const int n)
2305 {
2306 UTIL_CHECK(a.capacity() >= n + beginIdA);
2307 UTIL_CHECK(b.capacity() >= n + beginIdB);
2308
2309 // GPU resources
2310 int nBlocks, nThreads;
2311 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
2312
2313 // Launch kernel
2314 _mulEqV<<<nBlocks, nThreads>>>(a.cArray() + beginIdA,
2315 b.cArray() + beginIdB, n);
2316 cudaErrorCheck( cudaGetLastError() );
2317 }
2318
2319 // Vector in-place multiplication, a[i] *= b (cudaReal).
2321 const cudaReal b,
2322 const int beginIdA, const int n)
2323 {
2324 UTIL_CHECK(a.capacity() >= n + beginIdA);
2325
2326 // GPU resources
2327 int nBlocks, nThreads;
2328 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
2329
2330 // Launch kernel
2331 _mulEqS<<<nBlocks, nThreads>>>(a.cArray() + beginIdA, b, n);
2332 cudaErrorCheck( cudaGetLastError() );
2333 }
2334
2335 // Vector in-place multiplication, a[i] *= b (cudaComplex).
2337 const cudaComplex b,
2338 const int beginIdA, const int n)
2339 {
2340 UTIL_CHECK(a.capacity() >= n + beginIdA);
2341
2342 // GPU resources
2343 int nBlocks, nThreads;
2344 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
2345
2346 // Launch kernel
2347 _mulEqS<<<nBlocks, nThreads>>>(a.cArray() + beginIdA, b, n);
2348 cudaErrorCheck( cudaGetLastError() );
2349 }
2350
2351 /*
2352 * Vector in-place multiplication, a[i] *= b (mixed).
2353 */
2355 const cudaReal b,
2356 const int beginIdA, const int n)
2357 {
2358 UTIL_CHECK(a.capacity() >= n + beginIdA);
2359
2360 // GPU resources
2361 int nBlocks, nThreads;
2362 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
2363
2364 // Launch kernel
2365 _mulEqS<<<nBlocks, nThreads>>>(a.cArray() + beginIdA, b, n);
2366 cudaErrorCheck( cudaGetLastError() );
2367 }
2368
2369 // In-place division
2370
2371 /*
2372 * Vector elementwise in-place division, a[i] /= b[i] (real).
2373 */
2375 DeviceArray<cudaReal> const & b,
2376 const int beginIdA, const int beginIdB, const int n)
2377 {
2378 UTIL_CHECK(a.capacity() >= n + beginIdA);
2379 UTIL_CHECK(b.capacity() >= n + beginIdB);
2380
2381 // GPU resources
2382 int nBlocks, nThreads;
2383 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
2384
2385 // Launch kernel
2386 _divEqV<<<nBlocks, nThreads>>>(a.cArray() + beginIdA,
2387 b.cArray() + beginIdB, n);
2388 cudaErrorCheck( cudaGetLastError() );
2389 }
2390
2391 /*
2392 * Vector elementwise in-place division, a[i] /= b[i] (mixed).
2393 */
2395 DeviceArray<cudaReal> const & b,
2396 const int beginIdA, const int beginIdB, const int n)
2397 {
2398 UTIL_CHECK(a.capacity() >= n + beginIdA);
2399 UTIL_CHECK(b.capacity() >= n + beginIdB);
2400
2401 // GPU resources
2402 int nBlocks, nThreads;
2403 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
2404
2405 // Launch kernel
2406 _divEqV<<<nBlocks, nThreads>>>(a.cArray() + beginIdA,
2407 b.cArray() + beginIdB, n);
2408 cudaErrorCheck( cudaGetLastError() );
2409 }
2410
2411 /*
2412 * Vector-scalar in-place division, a[i] /= b (real).
2413 */
2415 const cudaReal b,
2416 const int beginIdA, const int n)
2417 {
2418 UTIL_CHECK(a.capacity() >= n + beginIdA);
2419
2420 // GPU resources
2421 int nBlocks, nThreads;
2422 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
2423
2424 // Launch kernel
2425 _divEqS<<<nBlocks, nThreads>>>(a.cArray() + beginIdA, b, n);
2426 cudaErrorCheck( cudaGetLastError() );
2427 }
2428
2429 /*
2430 * Vector-scalar in-place division, a[i] /= b (mixed).
2431 */
2433 const cudaReal b,
2434 const int beginIdA, const int n)
2435 {
2436 UTIL_CHECK(a.capacity() >= n + beginIdA);
2437
2438 // GPU resources
2439 int nBlocks, nThreads;
2440 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
2441
2442 // Launch kernel
2443 _divEqS<<<nBlocks, nThreads>>>(a.cArray() + beginIdA, b, n);
2444 cudaErrorCheck( cudaGetLastError() );
2445 }
2446
2447 // Exponentiation
2448
2449 /*
2450 * Vector exponentiation, a[i] = exp(b[i]) (real).
2451 */
2453 DeviceArray<cudaReal> const & b,
2454 const int beginIdA, const int beginIdB, const int n)
2455 {
2456 UTIL_CHECK(a.capacity() >= n + beginIdA);
2457 UTIL_CHECK(b.capacity() >= n + beginIdB);
2458
2459 // GPU resources
2460 int nBlocks, nThreads;
2461 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
2462
2463 // Launch kernel
2464 _expV<<<nBlocks, nThreads>>>(a.cArray() + beginIdA,
2465 b.cArray() + beginIdB, n);
2466 cudaErrorCheck( cudaGetLastError() );
2467 }
2468
2469 /*
2470 * Vector exponentiation, a[i] = exp(b[i]) (complex).
2471 */
2473 DeviceArray<cudaComplex> const & b,
2474 const int beginIdA, const int beginIdB, const int n)
2475 {
2476 UTIL_CHECK(a.capacity() >= n + beginIdA);
2477 UTIL_CHECK(b.capacity() >= n + beginIdB);
2478
2479 // GPU resources
2480 int nBlocks, nThreads;
2481 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
2482
2483 // Launch kernel
2484 _expV<<<nBlocks, nThreads>>>(a.cArray() + beginIdA,
2485 b.cArray() + beginIdB, n);
2486 cudaErrorCheck( cudaGetLastError() );
2487 }
2488
2489 // Elementwise arithmetic square
2490
2491 /*
2492 * Vector elementwise square, a[i] = b[i]*b[i] (real).
2493 */
2495 DeviceArray<cudaReal> const & b,
2496 const int beginIdA, const int beginIdB,
2497 const int n)
2498 {
2499 UTIL_CHECK(a.capacity() >= n + beginIdA);
2500 UTIL_CHECK(b.capacity() >= n + beginIdB);
2501
2502 // GPU resources
2503 int nBlocks, nThreads;
2504 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
2505
2506 // Launch kernel
2507 _sqV<<<nBlocks, nThreads>>>(a.cArray() + beginIdA,
2508 b.cArray() + beginIdB, n);
2509 cudaErrorCheck( cudaGetLastError() );
2510 }
2511
2512 /*
2513 * Vector elementwise square, a[i] = b[i]*b[i] (complex).
2514 */
2516 DeviceArray<cudaComplex> const & b,
2517 const int beginIdA, const int beginIdB,
2518 const int n)
2519 {
2520 UTIL_CHECK(a.capacity() >= n + beginIdA);
2521 UTIL_CHECK(b.capacity() >= n + beginIdB);
2522
2523 // GPU resources
2524 int nBlocks, nThreads;
2525 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
2526
2527 // Launch kernel
2528 _sqV<<<nBlocks, nThreads>>>(a.cArray() + beginIdA,
2529 b.cArray() + beginIdB, n);
2530 cudaErrorCheck( cudaGetLastError() );
2531 }
2532
2533 // Elementwise absolute magnitudes
2534
2535 /*
2536 * Vector absolute magnitude, a[i] = abs(b[i]) (real).
2537 */
2539 DeviceArray<cudaReal> const & b,
2540 const int beginIdA, const int beginIdB,
2541 const int n)
2542 {
2543 UTIL_CHECK(a.capacity() >= n + beginIdA);
2544 UTIL_CHECK(b.capacity() >= n + beginIdB);
2545
2546 // GPU resources
2547 int nBlocks, nThreads;
2548 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
2549
2550 // Launch kernel
2551 _absV<<<nBlocks, nThreads>>>(a.cArray() + beginIdA,
2552 b.cArray() + beginIdB, n);
2553 cudaErrorCheck( cudaGetLastError() );
2554 }
2555
2556 /*
2557 * Vector absolute magnitude squared, a[i] = |b[i]|^2 (complex).
2558 */
2560 DeviceArray<cudaComplex> const & b,
2561 const int beginIdA, const int beginIdB,
2562 const int n)
2563 {
2564 UTIL_CHECK(a.capacity() >= n + beginIdA);
2565 UTIL_CHECK(b.capacity() >= n + beginIdB);
2566
2567 // GPU resources
2568 int nBlocks, nThreads;
2569 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
2570
2571 // Launch kernel
2572 _sqAbsV<<<nBlocks, nThreads>>>(a.cArray() + beginIdA,
2573 b.cArray() + beginIdB, n);
2574 cudaErrorCheck( cudaGetLastError() );
2575 }
2576
2577} // namespace VecOp
2578} // namespace Pscf
Dynamic array on the GPU device with aligned data.
Definition DeviceArray.h:96
int capacity() const
Return array capacity.
Data * cArray()
Return pointer to underlying C array.
Array container class template.
Definition Array.h:40
Data * cArray()
Return a pointer to the underlying C array.
Definition Array.h:199
int capacity() const
Return allocated size.
Definition Array.h:144
#define UTIL_CHECK(condition)
Assertion macro suitable for serial or parallel production code.
Definition global.h:68
void divEqS(Array< double > &a, double b)
Vector-scalar in-place division, a[i] /= b.
Definition VecOp.cpp:292
void addEqV(Array< double > &a, Array< double > const &b)
Vector-vector in-place addition, a[i] += b[i] (real).
Definition VecOp.cpp:198
void divEqV(Array< double > &a, Array< double > const &b)
Vector-vector in-place division, a[i] /= b[i].
Definition VecOp.cpp:279
void addEqS(Array< double > &a, double b)
Vector-scalar in-place addition, a[i] += b (real).
Definition VecOp.cpp:211
void sqV(Array< double > &a, Array< double > const &b)
Vector element-wise square, a[i] = b[i]*b[i] (real).
Definition VecOp.cpp:334
void mulEqV(Array< double > &a, Array< double > const &b)
Vector-vector in-place multiplication, a[i] *= b[i] (real).
Definition VecOp.cpp:252
void eqV(Array< double > &a, Array< double > const &b, const int beginIdA, const int beginIdB, const int n)
Vector assignment, a[i] = b[i] (real, slice).
Definition VecOp.cpp:21
void sqAbsV(Array< double > &a, Array< fftw_complex > const &b)
Square of absolute magnitude, a[i] = |b[i]|^2 (complex).
Definition VecOpCx.cpp:698
void mulEqS(Array< double > &a, double b)
Vector-scalar in-place multiplication, a[i] *= b (real).
Definition VecOp.cpp:265
void subVV(Array< double > &a, Array< double > const &b, Array< double > const &c)
Vector-vector subtraction, a[i] = b[i] - c[i] (real)
Definition VecOp.cpp:95
void absV(Array< double > &a, Array< double > const &b)
Element-wise absolute magnitude, a[i] = abs(b[i]) (real).
Definition VecOp.cpp:349
void expV(Array< double > &a, Array< double > const &b)
Vector exponentiation, a[i] = exp(b[i]) (real).
Definition VecOp.cpp:306
void divVS(Array< double > &a, Array< double > const &b, double c)
Vector-scalar division, a[i] = b[i] / c (real).
Definition VecOp.cpp:170
void eqS(Array< double > &a, double b)
Vector assignment, a[i] = b (real).
Definition VecOp.cpp:50
void mulVV(Array< double > &a, Array< double > const &b, Array< double > const &c)
Vector-vector multiplication, a[i] = b[i] * c[i] (real).
Definition VecOp.cpp:125
void subVS(Array< double > &a, Array< double > const &b, double c)
Vector-scalar subtraction, a[i] = b[i] - c (real).
Definition VecOp.cpp:110
void subEqV(Array< double > &a, Array< double > const &b)
Vector-vector in-place subtraction, a[i] -= b[i] (real).
Definition VecOp.cpp:225
void addVV(Array< double > &a, Array< double > const &b, Array< double > const &c)
Vector-vector addition, a[i] = b[i] + c[i] (real)
Definition VecOp.cpp:64
void divSV(Array< double > &a, double b, Array< double > const &c)
Vector division, a[i] = b / c[i].
Definition VecOp.cpp:183
void addVS(Array< double > &a, Array< double > const &b, double c)
Vector-scalar addition, a[i] = b[i] + c (real).
Definition VecOp.cpp:79
void divVV(Array< double > &a, Array< double > const &b, Array< double > const &c)
Vector-vector division, a[i] = b[i] / c[i] (real).
Definition VecOp.cpp:155
void subEqS(Array< double > &a, double b)
Vector-scalar subtraction in-place, a[i] -= b (real).
Definition VecOp.cpp:238
void mulVS(Array< double > &a, Array< double > const &b, double c)
Vector-scalar multiplication, a[i] = b[i] * c (real).
Definition VecOp.cpp:140
void setThreadsLogical(int nThreadsLogical)
Given total number of threads, set 1D execution configuration.
int nThreads()
Get the number of threads per block for execution.
Vector operations on GPU or CPU.
Definition VecOp.cpp:14
PSCF package top-level namespace.
cufftDoubleComplex cudaComplex
Complex number type used in CPU code that uses FFTW.
Definition cudaTypes.h:22
cufftDoubleReal cudaReal
Real number type used in CPU code that uses FFTW.
Definition cudaTypes.h:35