PSCF v1.3
VecOp.cu
1/*
2* PSCF - Polymer Self-Consistent Field
3*
4* Copyright 2015 - 2025, The Regents of the University of Minnesota
5* Distributed under the terms of the GNU General Public License.
6*/
7
8#include "VecOp.h"
9#include <pscf/cuda/ThreadArray.h>
10#include <pscf/cuda/cudaErrorCheck.h>
11#include <cmath>
12
13namespace Pscf {
14namespace Prdc {
15namespace Cuda {
16namespace VecOp {
17
18// CUDA kernels:
19// (defined in anonymous namespace, used within this file only)
20
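/*
* All kernels defined below use a grid-stride loop: each thread begins at
* its global thread index and advances by the total number of launched
* threads, so a launch of any size covers an array of arbitrary length n.
* A minimal sketch of the shared pattern (illustrative only; _sketch is
* not a kernel defined in this file):
*
*    __global__ void _sketch(cudaReal* a, const int n)
*    {
*       int nThreads = blockDim.x * gridDim.x;               // total threads in grid
*       int startID = blockIdx.x * blockDim.x + threadIdx.x; // global thread id
*       for (int i = startID; i < n; i += nThreads) {
*          a[i] = 0.0;  // per-element work goes here
*       }
*    }
*/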
21namespace {
22
23 /*
24 * Vector assignment, a[i] = b[i], GPU kernel (cudaReal).
25 *
26 * \param a output array (LHS)
27 * \param b input array (RHS)
28 * \param n size of arrays
29 */
30 __global__ void _eqV(cudaReal* a, cudaReal const * b, const int n)
31 {
32 int nThreads = blockDim.x * gridDim.x;
33 int startID = blockIdx.x * blockDim.x + threadIdx.x;
34 for (int i = startID; i < n; i += nThreads) {
35 a[i] = b[i];
36 }
37 }
38
39 /*
40 * Vector assignment, a[i] = b[i], GPU kernel (cudaComplex).
41 *
42 * \param a output array (LHS)
43 * \param b input array (RHS)
44 * \param n size of arrays
45 */
46 __global__ void _eqV(cudaComplex* a, cudaComplex const * b, const int n)
47 {
48 int nThreads = blockDim.x * gridDim.x;
49 int startID = blockIdx.x * blockDim.x + threadIdx.x;
50 for (int i = startID; i < n; i += nThreads) {
51 a[i].x = b[i].x;
52 a[i].y = b[i].y;
53 }
54 }
55
56 /*
57 * Vector assignment, a[i] = b, GPU kernel (cudaReal).
58 *
59 * \param a output array (LHS)
60 * \param b input scalar (RHS)
61 * \param n size of arrays
62 */
63 __global__ void _eqS(cudaReal* a, const cudaReal b, const int n)
64 {
65 int nThreads = blockDim.x * gridDim.x;
66 int startID = blockIdx.x * blockDim.x + threadIdx.x;
67 for (int i = startID; i < n; i += nThreads) {
68 a[i] = b;
69 }
70 }
71
72 /*
73 * Vector assignment, a[i] = b, GPU kernel (cudaComplex).
74 *
75 * \param a output array (LHS)
76 * \param b input scalar (RHS)
77 * \param n size of arrays
78 */
79 __global__ void _eqS(cudaComplex* a, const cudaComplex b, const int n)
80 {
81 int nThreads = blockDim.x * gridDim.x;
82 int startID = blockIdx.x * blockDim.x + threadIdx.x;
83 for (int i = startID; i < n; i += nThreads) {
84 a[i].x = b.x;
85 a[i].y = b.y;
86 }
87 }
88
89 /*
90 * Vector addition, a[i] = b[i] + c[i], GPU kernel (cudaReal).
91 *
92 * \param a output array (LHS)
93 * \param b input array (RHS)
94 * \param c input array (RHS)
95 * \param n size of arrays
96 */
97 __global__ void _addVV(cudaReal* a, cudaReal const * b,
98 cudaReal const * c, const int n)
99 {
100 int nThreads = blockDim.x * gridDim.x;
101 int startID = blockIdx.x * blockDim.x + threadIdx.x;
102 for (int i = startID; i < n; i += nThreads) {
103 a[i] = b[i] + c[i];
104 }
105 }
106
107 /*
108 * Vector addition, a[i] = b[i] + c[i], GPU kernel (cudaComplex).
109 *
110 * \param a output array (LHS)
111 * \param b input array (RHS)
112 * \param c input array (RHS)
113 * \param n size of arrays
114 */
115 __global__ void _addVV(cudaComplex* a, cudaComplex const * b,
116 cudaComplex const * c, const int n)
117 {
118 int nThreads = blockDim.x * gridDim.x;
119 int startID = blockIdx.x * blockDim.x + threadIdx.x;
120 for (int i = startID; i < n; i += nThreads) {
121 a[i].x = b[i].x + c[i].x;
122 a[i].y = b[i].y + c[i].y;
123 }
124 }
125
126 /*
127 * Vector addition, a[i] = b[i] + c[i], GPU kernel (mixed, b = real).
128 *
129 * \param a output array (LHS)
130 * \param b input array (RHS)
131 * \param c input array (RHS)
132 * \param n size of arrays
133 */
134 __global__ void _addVV(cudaComplex* a, cudaReal const * b,
135 cudaComplex const * c, const int n)
136 {
137 int nThreads = blockDim.x * gridDim.x;
138 int startID = blockIdx.x * blockDim.x + threadIdx.x;
139 for (int i = startID; i < n; i += nThreads) {
140 a[i].x = b[i] + c[i].x;
141 a[i].y = c[i].y;
142 }
143 }
144
145 /*
146 * Vector addition, a[i] = b[i] + c[i], GPU kernel (mixed, c = real).
147 *
148 * \param a output array (LHS)
149 * \param b input array (RHS)
150 * \param c input array (RHS)
151 * \param n size of arrays
152 */
153 __global__ void _addVV(cudaComplex* a, cudaComplex const * b,
154 cudaReal const * c, const int n)
155 {
156 int nThreads = blockDim.x * gridDim.x;
157 int startID = blockIdx.x * blockDim.x + threadIdx.x;
158 for (int i = startID; i < n; i += nThreads) {
159 a[i].x = b[i].x + c[i];
160 a[i].y = b[i].y;
161 }
162 }
163
164 /*
165 * Vector addition, a[i] = b[i] + c, GPU kernel (cudaReal).
166 *
167 * \param a output array (LHS)
168 * \param b input array (RHS)
169 * \param c input scalar (RHS)
170 * \param n size of arrays
171 */
172 __global__ void _addVS(cudaReal* a, cudaReal const * b,
173 const cudaReal c, const int n)
174 {
175 int nThreads = blockDim.x * gridDim.x;
176 int startID = blockIdx.x * blockDim.x + threadIdx.x;
177 for (int i = startID; i < n; i += nThreads) {
178 a[i] = b[i] + c;
179 }
180 }
181
182 /*
183 * Vector addition, a[i] = b[i] + c, GPU kernel (cudaComplex).
184 *
185 * \param a output array (LHS)
186 * \param b input array (RHS)
187 * \param c input scalar (RHS)
188 * \param n size of arrays
189 */
190 __global__ void _addVS(cudaComplex* a, cudaComplex const * b,
191 const cudaComplex c, const int n)
192 {
193 int nThreads = blockDim.x * gridDim.x;
194 int startID = blockIdx.x * blockDim.x + threadIdx.x;
195 for (int i = startID; i < n; i += nThreads) {
196 a[i].x = b[i].x + c.x;
197 a[i].y = b[i].y + c.y;
198 }
199 }
200
201 /*
202 * Vector addition, a[i] = b[i] + c, GPU kernel (mixed, b = real).
203 *
204 * \param a output array (LHS)
205 * \param b input array (RHS)
206 * \param c input scalar (RHS)
207 * \param n size of arrays
208 */
209 __global__ void _addVS(cudaComplex* a, cudaReal const * b,
210 const cudaComplex c, const int n)
211 {
212 int nThreads = blockDim.x * gridDim.x;
213 int startID = blockIdx.x * blockDim.x + threadIdx.x;
214 for (int i = startID; i < n; i += nThreads) {
215 a[i].x = b[i] + c.x;
216 a[i].y = c.y;
217 }
218 }
219
220 /*
221 * Vector addition, a[i] = b[i] + c, GPU kernel (mixed, c = real).
222 *
223 * \param a output array (LHS)
224 * \param b input array (RHS)
225 * \param c input scalar (RHS)
226 * \param n size of arrays
227 */
228 __global__ void _addVS(cudaComplex* a, cudaComplex const * b,
229 const cudaReal c, const int n)
230 {
231 int nThreads = blockDim.x * gridDim.x;
232 int startID = blockIdx.x * blockDim.x + threadIdx.x;
233 for (int i = startID; i < n; i += nThreads) {
234 a[i].x = b[i].x + c;
235 a[i].y = b[i].y;
236 }
237 }
238
239 /*
240 * Vector subtraction, a[i] = b[i] - c[i], GPU kernel (cudaReal).
241 *
242 * \param a output array (LHS)
243 * \param b input array (RHS)
244 * \param c input array (RHS)
245 * \param n size of arrays
246 */
247 __global__ void _subVV(cudaReal* a, cudaReal const * b,
248 cudaReal const * c, const int n)
249 {
250 int nThreads = blockDim.x * gridDim.x;
251 int startID = blockIdx.x * blockDim.x + threadIdx.x;
252 for (int i = startID; i < n; i += nThreads) {
253 a[i] = b[i] - c[i];
254 }
255 }
256
257 /*
258 * Vector subtraction, a[i] = b[i] - c[i], GPU kernel (cudaComplex).
259 *
260 * \param a output array (LHS)
261 * \param b input array (RHS)
262 * \param c input array (RHS)
263 * \param n size of arrays
264 */
265 __global__ void _subVV(cudaComplex* a, cudaComplex const * b,
266 cudaComplex const * c, const int n)
267 {
268 int nThreads = blockDim.x * gridDim.x;
269 int startID = blockIdx.x * blockDim.x + threadIdx.x;
270 for (int i = startID; i < n; i += nThreads) {
271 a[i].x = b[i].x - c[i].x;
272 a[i].y = b[i].y - c[i].y;
273 }
274 }
275
276 /*
277 * Vector subtraction, a[i] = b[i] - c[i], GPU kernel (mixed, b = real).
278 *
279 * \param a output array (LHS)
280 * \param b input array (RHS)
281 * \param c input array (RHS)
282 * \param n size of arrays
283 */
284 __global__ void _subVV(cudaComplex* a, cudaReal const * b,
285 cudaComplex const * c, const int n)
286 {
287 int nThreads = blockDim.x * gridDim.x;
288 int startID = blockIdx.x * blockDim.x + threadIdx.x;
289 for (int i = startID; i < n; i += nThreads) {
290 a[i].x = b[i] - c[i].x;
291 a[i].y = 0.0 - c[i].y;
292 }
293 }
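/*
* Note: b is purely real here, so the imaginary part of b[i] - c[i] is
* just the negated imaginary part of c[i], written above as 0.0 - c[i].y.
*/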
294
295 /*
296 * Vector subtraction, a[i] = b[i] - c[i], GPU kernel (mixed, c = real).
297 *
298 * \param a output array (LHS)
299 * \param b input array (RHS)
300 * \param c input array (RHS)
301 * \param n size of arrays
302 */
303 __global__ void _subVV(cudaComplex* a, cudaComplex const * b,
304 cudaReal const * c, const int n)
305 {
306 int nThreads = blockDim.x * gridDim.x;
307 int startID = blockIdx.x * blockDim.x + threadIdx.x;
308 for (int i = startID; i < n; i += nThreads) {
309 a[i].x = b[i].x - c[i];
310 a[i].y = b[i].y;
311 }
312 }
313
314 /*
315 * Vector subtraction, a[i] = b[i] - c, GPU kernel (cudaReal).
316 *
317 * \param a output array (LHS)
318 * \param b input array (RHS)
319 * \param c input scalar (RHS)
320 * \param n size of arrays
321 */
322 __global__ void _subVS(cudaReal* a, cudaReal const * b,
323 const cudaReal c, const int n)
324 {
325 int nThreads = blockDim.x * gridDim.x;
326 int startID = blockIdx.x * blockDim.x + threadIdx.x;
327 for (int i = startID; i < n; i += nThreads) {
328 a[i] = b[i] - c;
329 }
330 }
331
332 /*
333 * Vector subtraction, a[i] = b[i] - c, GPU kernel (cudaComplex).
334 *
335 * \param a output array (LHS)
336 * \param b input array (RHS)
337 * \param c input scalar (RHS)
338 * \param n size of arrays
339 */
340 __global__ void _subVS(cudaComplex* a, cudaComplex const * b,
341 const cudaComplex c, const int n)
342 {
343 int nThreads = blockDim.x * gridDim.x;
344 int startID = blockIdx.x * blockDim.x + threadIdx.x;
345 for (int i = startID; i < n; i += nThreads) {
346 a[i].x = b[i].x - c.x;
347 a[i].y = b[i].y - c.y;
348 }
349 }
350
351 /*
352 * Vector subtraction, a[i] = b[i] - c, GPU kernel (mixed, b = real).
353 *
354 * \param a output array (LHS)
355 * \param b input array (RHS)
356 * \param c input scalar (RHS)
357 * \param n size of arrays
358 */
359 __global__ void _subVS(cudaComplex* a, cudaReal const * b,
360 const cudaComplex c, const int n)
361 {
362 int nThreads = blockDim.x * gridDim.x;
363 int startID = blockIdx.x * blockDim.x + threadIdx.x;
364 for (int i = startID; i < n; i += nThreads) {
365 a[i].x = b[i] - c.x;
366 a[i].y = 0.0 - c.y;
367 }
368 }
369
370 /*
371 * Vector subtraction, a[i] = b[i] - c, GPU kernel (mixed, c = real).
372 *
373 * \param a output array (LHS)
374 * \param b input array (RHS)
375 * \param c input scalar (RHS)
376 * \param n size of arrays
377 */
378 __global__ void _subVS(cudaComplex* a, cudaComplex const * b,
379 const cudaReal c, const int n)
380 {
381 int nThreads = blockDim.x * gridDim.x;
382 int startID = blockIdx.x * blockDim.x + threadIdx.x;
383 for (int i = startID; i < n; i += nThreads) {
384 a[i].x = b[i].x - c;
385 a[i].y = b[i].y;
386 }
387 }
388
389 /*
390 * Vector multiplication, a[i] = b[i] * c[i], GPU kernel (cudaReal).
391 *
392 * \param a output array (LHS)
393 * \param b input array (RHS)
394 * \param c input array (RHS)
395 * \param n size of arrays
396 */
397 __global__ void _mulVV(cudaReal* a, cudaReal const * b,
398 cudaReal const * c, const int n)
399 {
400 int nThreads = blockDim.x * gridDim.x;
401 int startID = blockIdx.x * blockDim.x + threadIdx.x;
402 for (int i = startID; i < n; i += nThreads) {
403 a[i] = b[i] * c[i];
404 }
405 }
406
407 /*
408 * Vector multiplication, a[i] = b[i] * c[i], GPU kernel (cudaComplex).
409 *
410 * \param a output array (LHS)
411 * \param b input array (RHS)
412 * \param c input array (RHS)
413 * \param n size of arrays
414 */
415 __global__ void _mulVV(cudaComplex* a, cudaComplex const * b,
416 cudaComplex const * c, const int n)
417 {
418 int nThreads = blockDim.x * gridDim.x;
419 int startID = blockIdx.x * blockDim.x + threadIdx.x;
420 for (int i = startID; i < n; i += nThreads) {
421 a[i].x = (b[i].x * c[i].x) - (b[i].y * c[i].y);
422 a[i].y = (b[i].x * c[i].y) + (b[i].y * c[i].x);
423 }
424 }
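/*
* Note: the kernel above forms the standard complex product
* (bx + i*by)(cx + i*cy) = (bx*cx - by*cy) + i*(bx*cy + by*cx),
* with the .x member holding the real part and .y the imaginary part.
*/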
425
426 /*
427 * Vector multiplication, a[i] = b[i] * c[i], GPU kernel (mixed, b = real).
428 *
429 * \param a output array (LHS)
430 * \param b input array (RHS)
431 * \param c input array (RHS)
432 * \param n size of arrays
433 */
434 __global__ void _mulVV(cudaComplex* a, cudaReal const * b,
435 cudaComplex const * c, const int n)
436 {
437 int nThreads = blockDim.x * gridDim.x;
438 int startID = blockIdx.x * blockDim.x + threadIdx.x;
439 for (int i = startID; i < n; i += nThreads) {
440 a[i].x = b[i] * c[i].x;
441 a[i].y = b[i] * c[i].y;
442 }
443 }
444
445 /*
446 * Vector multiplication, a[i] = b[i] * c[i], GPU kernel (mixed, c = real).
447 *
448 * \param a output array (LHS)
449 * \param b input array (RHS)
450 * \param c input array (RHS)
451 * \param n size of arrays
452 */
453 __global__ void _mulVV(cudaComplex* a, cudaComplex const * b,
454 cudaReal const * c, const int n)
455 {
456 int nThreads = blockDim.x * gridDim.x;
457 int startID = blockIdx.x * blockDim.x + threadIdx.x;
458 for (int i = startID; i < n; i += nThreads) {
459 a[i].x = b[i].x * c[i];
460 a[i].y = b[i].y * c[i];
461 }
462 }
463
464 /*
465 * Vector multiplication, a[i] = b[i] * c, GPU kernel (cudaReal).
466 *
467 * \param a output array (LHS)
468 * \param b input array (RHS)
469 * \param c input scalar (RHS)
470 * \param n size of arrays
471 */
472 __global__ void _mulVS(cudaReal* a, cudaReal const * b,
473 const cudaReal c, const int n)
474 {
475 int nThreads = blockDim.x * gridDim.x;
476 int startID = blockIdx.x * blockDim.x + threadIdx.x;
477 for (int i = startID; i < n; i += nThreads) {
478 a[i] = b[i] * c;
479 }
480 }
481
482 /*
483 * Vector multiplication, a[i] = b[i] * c, GPU kernel (cudaComplex).
484 *
485 * \param a output array (LHS)
486 * \param b input array (RHS)
487 * \param c input scalar (RHS)
488 * \param n size of arrays
489 */
490 __global__ void _mulVS(cudaComplex* a, cudaComplex const * b,
491 const cudaComplex c, const int n)
492 {
493 int nThreads = blockDim.x * gridDim.x;
494 int startID = blockIdx.x * blockDim.x + threadIdx.x;
495 for (int i = startID; i < n; i += nThreads) {
496 a[i].x = (b[i].x * c.x) - (b[i].y * c.y);
497 a[i].y = (b[i].x * c.y) + (b[i].y * c.x);
498 }
499 }
500
501 /*
502 * Vector multiplication, a[i] = b[i] * c, GPU kernel (mixed, b = real).
503 *
504 * \param a output array (LHS)
505 * \param b input array (RHS)
506 * \param c input scalar (RHS)
507 * \param n size of arrays
508 */
509 __global__ void _mulVS(cudaComplex* a, cudaReal const * b,
510 const cudaComplex c, const int n)
511 {
512 int nThreads = blockDim.x * gridDim.x;
513 int startID = blockIdx.x * blockDim.x + threadIdx.x;
514 for (int i = startID; i < n; i += nThreads) {
515 a[i].x = b[i] * c.x;
516 a[i].y = b[i] * c.y;
517 }
518 }
519
520 /*
521 * Vector multiplication, a[i] = b[i] * c, GPU kernel (mixed, c = real).
522 *
523 * \param a output array (LHS)
524 * \param b input array (RHS)
525 * \param c input scalar (RHS)
526 * \param n size of arrays
527 */
528 __global__ void _mulVS(cudaComplex* a, cudaComplex const * b,
529 const cudaReal c, const int n)
530 {
531 int nThreads = blockDim.x * gridDim.x;
532 int startID = blockIdx.x * blockDim.x + threadIdx.x;
533 for (int i = startID; i < n; i += nThreads) {
534 a[i].x = b[i].x * c;
535 a[i].y = b[i].y * c;
536 }
537 }
538
539 /*
540 * Vector division, a[i] = b[i] / c[i], GPU kernel (cudaReal).
541 *
542 * \param a output array (LHS)
543 * \param b input array (RHS)
544 * \param c input array (RHS)
545 * \param n size of arrays
546 */
547 __global__ void _divVV(cudaReal* a, cudaReal const * b,
548 cudaReal const * c, const int n)
549 {
550 int nThreads = blockDim.x * gridDim.x;
551 int startID = blockIdx.x * blockDim.x + threadIdx.x;
552 for (int i = startID; i < n; i += nThreads) {
553 a[i] = b[i] / c[i];
554 }
555 }
556
557 /*
558 * Vector division, a[i] = b[i] / c[i], GPU kernel (mixed, c = real).
559 *
560 * \param a output array (LHS)
561 * \param b input array (RHS)
562 * \param c input array (RHS)
563 * \param n size of arrays
564 */
565 __global__ void _divVV(cudaComplex* a, cudaComplex const * b,
566 cudaReal const * c, const int n)
567 {
568 int nThreads = blockDim.x * gridDim.x;
569 int startID = blockIdx.x * blockDim.x + threadIdx.x;
570 for (int i = startID; i < n; i += nThreads) {
571 a[i].x = b[i].x / c[i];
572 a[i].y = b[i].y / c[i];
573 }
574 }
575
576 /*
577 * Vector division, a[i] = b[i] / c, GPU kernel (cudaReal).
578 *
579 * \param a output array (LHS)
580 * \param b input array (RHS)
581 * \param c input scalar (RHS)
582 * \param n size of arrays
583 */
584 __global__ void _divVS(cudaReal* a, cudaReal const * b,
585 const cudaReal c, const int n)
586 {
587 int nThreads = blockDim.x * gridDim.x;
588 int startID = blockIdx.x * blockDim.x + threadIdx.x;
589 for (int i = startID; i < n; i += nThreads) {
590 a[i] = b[i] / c;
591 }
592 }
593
594 /*
595 * Vector division, a[i] = b[i] / c, GPU kernel (mixed, c = real).
596 *
597 * \param a output array (LHS)
598 * \param b input array (RHS)
599 * \param c input scalar (RHS)
600 * \param n size of arrays
601 */
602 __global__ void _divVS(cudaComplex* a, cudaComplex const * b,
603 const cudaReal c, const int n)
604 {
605 int nThreads = blockDim.x * gridDim.x;
606 int startID = blockIdx.x * blockDim.x + threadIdx.x;
607 for (int i = startID; i < n; i += nThreads) {
608 a[i].x = b[i].x / c;
609 a[i].y = b[i].y / c;
610 }
611 }
612
613 /*
614 * Vector division, a[i] = b / c[i], GPU kernel (cudaReal).
615 *
616 * \param a output array (LHS)
617 * \param b input scalar (RHS)
618 * \param c input array (RHS)
619 * \param n size of arrays
620 */
621 __global__ void _divSV(cudaReal* a, const cudaReal b,
622 cudaReal const * c, const int n)
623 {
624 int nThreads = blockDim.x * gridDim.x;
625 int startID = blockIdx.x * blockDim.x + threadIdx.x;
626 for (int i = startID; i < n; i += nThreads) {
627 a[i] = b / c[i];
628 }
629 }
630
631 /*
632 * Vector exponentiation, a[i] = exp(b[i]), GPU kernel (cudaReal).
633 *
634 * \param a output array (LHS)
635 * \param b input array (RHS)
636 * \param n size of arrays
637 */
638 __global__ void _expV(cudaReal* a, cudaReal const * b, const int n)
639 {
640 int nThreads = blockDim.x * gridDim.x;
641 int startID = blockIdx.x * blockDim.x + threadIdx.x;
642 for (int i = startID; i < n; i += nThreads) {
643 a[i] = exp(b[i]);
644 }
645 }
646
647 /*
648 * Vector exponentiation, a[i] = exp(b[i]), GPU kernel (cudaComplex).
649 *
650 * \param a output array (LHS)
651 * \param b input array (RHS)
652 * \param n size of arrays
653 */
654 __global__ void _expV(cudaComplex* a, cudaComplex const * b, const int n)
655 {
656 int nThreads = blockDim.x * gridDim.x;
657 int startID = blockIdx.x * blockDim.x + threadIdx.x;
658 for (int i = startID; i < n; i += nThreads) {
659 a[i].x = exp(b[i].x) * cos(b[i].y);
660 a[i].y = exp(b[i].x) * sin(b[i].y);
661 }
662 }
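/*
* Note: the kernel above evaluates the complex exponential via Euler's
* formula, exp(x + i*y) = exp(x) * (cos(y) + i*sin(y)).
*/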
663
664 /*
665 * Vector addition in-place, a[i] += b[i], GPU kernel (cudaReal).
666 *
667 * \param a output array (LHS)
668 * \param b input array (RHS)
669 * \param n size of arrays
670 */
671 __global__ void _addEqV(cudaReal* a, cudaReal const * b, const int n)
672 {
673 int nThreads = blockDim.x * gridDim.x;
674 int startID = blockIdx.x * blockDim.x + threadIdx.x;
675 for (int i = startID; i < n; i += nThreads) {
676 a[i] += b[i];
677 }
678 }
679
680 /*
681 * Vector addition in-place, a[i] += b[i], GPU kernel (cudaComplex).
682 *
683 * \param a output array (LHS)
684 * \param b input array (RHS)
685 * \param n size of arrays
686 */
687 __global__ void _addEqV(cudaComplex* a, cudaComplex const * b, const int n)
688 {
689 int nThreads = blockDim.x * gridDim.x;
690 int startID = blockIdx.x * blockDim.x + threadIdx.x;
691 for (int i = startID; i < n; i += nThreads) {
692 a[i].x += b[i].x;
693 a[i].y += b[i].y;
694 }
695 }
696
697 /*
698 * Vector addition in-place, a[i] += b[i], GPU kernel (mixed, b = real).
699 *
700 * \param a output array (LHS)
701 * \param b input array (RHS)
702 * \param n size of arrays
703 */
704 __global__ void _addEqV(cudaComplex* a, cudaReal const * b, const int n)
705 {
706 int nThreads = blockDim.x * gridDim.x;
707 int startID = blockIdx.x * blockDim.x + threadIdx.x;
708 for (int i = startID; i < n; i += nThreads) {
709 a[i].x += b[i];
710 }
711 }
712
713 /*
714 * Vector addition in-place, a[i] += b, GPU kernel (cudaReal).
715 *
716 * \param a output array (LHS)
717 * \param b input scalar (RHS)
718 * \param n size of arrays
719 */
720 __global__ void _addEqS(cudaReal* a, const cudaReal b, const int n)
721 {
722 int nThreads = blockDim.x * gridDim.x;
723 int startID = blockIdx.x * blockDim.x + threadIdx.x;
724 for (int i = startID; i < n; i += nThreads) {
725 a[i] += b;
726 }
727 }
728
729 /*
730 * Vector addition in-place, a[i] += b, GPU kernel (cudaComplex).
731 *
732 * \param a output array (LHS)
733 * \param b input scalar (RHS)
734 * \param n size of arrays
735 */
736 __global__ void _addEqS(cudaComplex* a, const cudaComplex b,
737 const int n)
738 {
739 int nThreads = blockDim.x * gridDim.x;
740 int startID = blockIdx.x * blockDim.x + threadIdx.x;
741 for (int i = startID; i < n; i += nThreads) {
742 a[i].x += b.x;
743 a[i].y += b.y;
744 }
745 }
746
747 /*
748 * Vector addition in-place, a[i] += b, GPU kernel (mixed, b = real).
749 *
750 * \param a output array (LHS)
751 * \param b input scalar (RHS)
752 * \param n size of arrays
753 */
754 __global__ void _addEqS(cudaComplex* a, const cudaReal b, const int n)
755 {
756 int nThreads = blockDim.x * gridDim.x;
757 int startID = blockIdx.x * blockDim.x + threadIdx.x;
758 for (int i = startID; i < n; i += nThreads) {
759 a[i].x += b;
760 }
761 }
762
763 /*
764 * Vector subtraction in-place, a[i] -= b[i], GPU kernel (cudaReal).
765 *
766 * \param a output array (LHS)
767 * \param b input array (RHS)
768 * \param n size of arrays
769 */
770 __global__ void _subEqV(cudaReal* a, cudaReal const * b, const int n)
771 {
772 int nThreads = blockDim.x * gridDim.x;
773 int startID = blockIdx.x * blockDim.x + threadIdx.x;
774 for (int i = startID; i < n; i += nThreads) {
775 a[i] -= b[i];
776 }
777 }
778
779 /*
780 * Vector subtraction in-place, a[i] -= b[i], GPU kernel (cudaComplex).
781 *
782 * \param a output array (LHS)
783 * \param b input array (RHS)
784 * \param n size of arrays
785 */
786 __global__ void _subEqV(cudaComplex* a, cudaComplex const * b, const int n)
787 {
788 int nThreads = blockDim.x * gridDim.x;
789 int startID = blockIdx.x * blockDim.x + threadIdx.x;
790 for (int i = startID; i < n; i += nThreads) {
791 a[i].x -= b[i].x;
792 a[i].y -= b[i].y;
793 }
794 }
795
796 /*
797 * Vector subtraction in-place, a[i] -= b[i], GPU kernel (mixed, b = real).
798 *
799 * \param a output array (LHS)
800 * \param b input array (RHS)
801 * \param n size of arrays
802 */
803 __global__ void _subEqV(cudaComplex* a, cudaReal const * b, const int n)
804 {
805 int nThreads = blockDim.x * gridDim.x;
806 int startID = blockIdx.x * blockDim.x + threadIdx.x;
807 for (int i = startID; i < n; i += nThreads) {
808 a[i].x -= b[i];
809 }
810 }
811
812 /*
813 * Vector subtraction in-place, a[i] -= b, GPU kernel (cudaReal).
814 *
815 * \param a output array (LHS)
816 * \param b input scalar (RHS)
817 * \param n size of arrays
818 */
819 __global__ void _subEqS(cudaReal* a, const cudaReal b, const int n)
820 {
821 int nThreads = blockDim.x * gridDim.x;
822 int startID = blockIdx.x * blockDim.x + threadIdx.x;
823 for (int i = startID; i < n; i += nThreads) {
824 a[i] -= b;
825 }
826 }
827
828 /*
829 * Vector subtraction in-place, a[i] -= b, GPU kernel (cudaComplex).
830 *
831 * \param a output array (LHS)
832 * \param b input scalar (RHS)
833 * \param n size of arrays
834 */
835 __global__ void _subEqS(cudaComplex* a, const cudaComplex b, const int n)
836 {
837 int nThreads = blockDim.x * gridDim.x;
838 int startID = blockIdx.x * blockDim.x + threadIdx.x;
839 for (int i = startID; i < n; i += nThreads) {
840 a[i].x -= b.x;
841 a[i].y -= b.y;
842 }
843 }
844
845 /*
846 * Vector subtraction in-place, a[i] -= b, GPU kernel (mixed, b = real).
847 *
848 * \param a output array (LHS)
849 * \param b input scalar (RHS)
850 * \param n size of arrays
851 */
852 __global__ void _subEqS(cudaComplex* a, const cudaReal b, const int n)
853 {
854 int nThreads = blockDim.x * gridDim.x;
855 int startID = blockIdx.x * blockDim.x + threadIdx.x;
856 for (int i = startID; i < n; i += nThreads) {
857 a[i].x -= b;
858 }
859 }
860
861 /*
862 * Vector multiplication in-place, a[i] *= b[i], GPU kernel (cudaReal).
863 *
864 * \param a output array (LHS)
865 * \param b input array (RHS)
866 * \param n size of arrays
867 */
868 __global__ void _mulEqV(cudaReal* a, cudaReal const * b, const int n)
869 {
870 int nThreads = blockDim.x * gridDim.x;
871 int startID = blockIdx.x * blockDim.x + threadIdx.x;
872 for (int i = startID; i < n; i += nThreads) {
873 a[i] *= b[i];
874 }
875 }
876
877 /*
878 * Vector multiplication in-place, a[i] *= b[i], GPU kernel (cudaComplex).
879 *
880 * \param a output array (LHS)
881 * \param b input array (RHS)
882 * \param n size of arrays
883 */
884 __global__ void _mulEqV(cudaComplex* a, cudaComplex const * b, const int n)
885 {
886 int nThreads = blockDim.x * gridDim.x;
887 int startID = blockIdx.x * blockDim.x + threadIdx.x;
888 cudaComplex c;
889 for (int i = startID; i < n; i += nThreads) {
890 c.x = (a[i].x * b[i].x) - (a[i].y * b[i].y);
891 c.y = (a[i].x * b[i].y) + (a[i].y * b[i].x);
892 a[i].x = c.x;
893 a[i].y = c.y;
894 }
895 }
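/*
* Note: the temporary c is required because a[i] is both an input and the
* output; without it, the update of a[i].y would read the already
* overwritten value of a[i].x.
*/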
896
897 /*
898 * Vector multiplication in-place, a[i]*=b[i], GPU kernel (mixed, b = real).
899 *
900 * \param a output array (LHS)
901 * \param b input array (RHS)
902 * \param n size of arrays
903 */
904 __global__ void _mulEqV(cudaComplex* a, cudaReal const * b, const int n)
905 {
906 int nThreads = blockDim.x * gridDim.x;
907 int startID = blockIdx.x * blockDim.x + threadIdx.x;
908 for (int i = startID; i < n; i += nThreads) {
909 a[i].x *= b[i];
910 a[i].y *= b[i];
911 }
912 }
913
914 /*
915 * Vector multiplication in-place, a[i] *= b, GPU kernel (cudaReal).
916 *
917 * \param a output array (LHS)
918 * \param b input scalar (RHS)
919 * \param n size of arrays
920 */
921 __global__ void _mulEqS(cudaReal* a, const cudaReal b, const int n)
922 {
923 int nThreads = blockDim.x * gridDim.x;
924 int startID = blockIdx.x * blockDim.x + threadIdx.x;
925 for (int i = startID; i < n; i += nThreads) {
926 a[i] *= b;
927 }
928 }
929
930 /*
931 * Vector multiplication in-place, a[i] *= b, GPU kernel (cudaComplex).
932 *
933 * \param a output array (LHS)
934 * \param b input scalar (RHS)
935 * \param n size of arrays
936 */
937 __global__ void _mulEqS(cudaComplex* a, const cudaComplex b, const int n)
938 {
939 int nThreads = blockDim.x * gridDim.x;
940 int startID = blockIdx.x * blockDim.x + threadIdx.x;
941 cudaComplex c;
942 for (int i = startID; i < n; i += nThreads) {
943 c.x = (a[i].x * b.x) - (a[i].y * b.y);
944 c.y = (a[i].x * b.y) + (a[i].y * b.x);
945 a[i].x = c.x;
946 a[i].y = c.y;
947 }
948 }
949
950 /*
951 * Vector multiplication in-place, a[i] *= b, GPU kernel (mixed, b = real).
952 *
953 * \param a output array (LHS)
954 * \param b input scalar (RHS)
955 * \param n size of arrays
956 */
957 __global__ void _mulEqS(cudaComplex* a, const cudaReal b, const int n)
958 {
959 int nThreads = blockDim.x * gridDim.x;
960 int startID = blockIdx.x * blockDim.x + threadIdx.x;
961 for (int i = startID; i < n; i += nThreads) {
962 a[i].x *= b;
963 a[i].y *= b;
964 }
965 }
966
967 /*
968 * Vector division in-place, a[i] /= b[i], GPU kernel (cudaReal).
969 *
970 * \param a output array (LHS)
971 * \param b input array (RHS)
972 * \param n size of arrays
973 */
974 __global__ void _divEqV(cudaReal* a, cudaReal const * b, const int n)
975 {
976 int nThreads = blockDim.x * gridDim.x;
977 int startID = blockIdx.x * blockDim.x + threadIdx.x;
978 for (int i = startID; i < n; i += nThreads) {
979 a[i] /= b[i];
980 }
981 }
982
983 /*
984 * Vector division in-place, a[i] /= b[i], GPU kernel (mixed, b = real).
985 *
986 * \param a output array (LHS)
987 * \param b input array (RHS)
988 * \param n size of arrays
989 */
990 __global__ void _divEqV(cudaComplex* a, cudaReal const * b, const int n)
991 {
992 int nThreads = blockDim.x * gridDim.x;
993 int startID = blockIdx.x * blockDim.x + threadIdx.x;
994 for (int i = startID; i < n; i += nThreads) {
995 a[i].x /= b[i];
996 a[i].y /= b[i];
997 }
998 }
999
1000 /*
1001 * Vector division in-place, a[i] /= b, GPU kernel (cudaReal).
1002 *
1003 * \param a output array (LHS)
1004 * \param b input scalar (RHS)
1005 * \param n size of arrays
1006 */
1007 __global__ void _divEqS(cudaReal* a, const cudaReal b, const int n)
1008 {
1009 int nThreads = blockDim.x * gridDim.x;
1010 int startID = blockIdx.x * blockDim.x + threadIdx.x;
1011 for (int i = startID; i < n; i += nThreads) {
1012 a[i] /= b;
1013 }
1014 }
1015
1016 /*
1017 * Vector division in-place, a[i] /= b, GPU kernel (mixed, b = real).
1018 *
1019 * \param a output array (LHS)
1020 * \param b input scalar (RHS)
1021 * \param n size of arrays
1022 */
1023 __global__ void _divEqS(cudaComplex* a, const cudaReal b, const int n)
1024 {
1025 int nThreads = blockDim.x * gridDim.x;
1026 int startID = blockIdx.x * blockDim.x + threadIdx.x;
1027 for (int i = startID; i < n; i += nThreads) {
1028 a[i].x /= b;
1029 a[i].y /= b;
1030 }
1031 }
1032
1033} // end anonymous namespace
1034
1035
1036// CUDA kernel wrappers:
1037
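/*
* Each wrapper below checks that the output and input arrays are large
* enough, computes a launch configuration with
* ThreadArray::setThreadsLogical, launches the matching kernel at the
* requested offsets, and checks for launch errors. A minimal host-side
* usage sketch (illustrative only; it assumes DeviceArray provides an
* allocate(int) member for device memory allocation):
*
*    DeviceArray<cudaReal> a, b, c;
*    a.allocate(n);
*    b.allocate(n);
*    c.allocate(n);
*    VecOp::eqS(b, 1.0, 0, n);           // b[i] = 1.0
*    VecOp::eqS(c, 2.0, 0, n);           // c[i] = 2.0
*    VecOp::addVV(a, b, c, 0, 0, 0, n);  // a[i] = b[i] + c[i]
*/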
1038// Vector assignment, a[i] = b[i], kernel wrapper (cudaReal).
1039void eqV(DeviceArray<cudaReal>& a, DeviceArray<cudaReal> const & b,
1040 const int beginIdA, const int beginIdB, const int n)
1041{
1042 UTIL_CHECK(a.capacity() >= n + beginIdA);
1043 UTIL_CHECK(b.capacity() >= n + beginIdB);
1044
1045 // GPU resources
1046 int nBlocks, nThreads;
1047 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1048
1049 // Launch kernel
1050 _eqV<<<nBlocks, nThreads>>>(a.cArray()+beginIdA,
1051 b.cArray()+beginIdB, n);
1052 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1053}
1054
1055// Vector assignment, a[i] = b[i], kernel wrapper (cudaComplex).
1056void eqV(DeviceArray<cudaComplex>& a, DeviceArray<cudaComplex> const & b,
1057 const int beginIdA, const int beginIdB, const int n)
1058{
1059 UTIL_CHECK(a.capacity() >= n + beginIdA);
1060 UTIL_CHECK(b.capacity() >= n + beginIdB);
1061
1062 // GPU resources
1063 int nBlocks, nThreads;
1064 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1065
1066 // Launch kernel
1067 _eqV<<<nBlocks, nThreads>>>(a.cArray()+beginIdA,
1068 b.cArray()+beginIdB, n);
1069 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1070}
1071
1072// Vector assignment, a[i] = b, kernel wrapper (cudaReal).
1073void eqS(DeviceArray<cudaReal>& a, const cudaReal b,
1074 const int beginIdA, const int n)
1075{
1076 UTIL_CHECK(a.capacity() >= n + beginIdA);
1077
1078 // GPU resources
1079 int nBlocks, nThreads;
1080 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1081
1082 // Launch kernel
1083 _eqS<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b, n);
1084 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1085}
1086
1087// Vector assignment, a[i] = b, kernel wrapper (cudaComplex).
1088void eqS(DeviceArray<cudaComplex>& a, const cudaComplex b,
1089 const int beginIdA, const int n)
1090{
1091 UTIL_CHECK(a.capacity() >= n + beginIdA);
1092
1093 // GPU resources
1094 int nBlocks, nThreads;
1095 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1096
1097 // Launch kernel
1098 _eqS<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b, n);
1099 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1100}
1101
1102// Vector addition, a[i] = b[i] + c[i], kernel wrapper (cudaReal).
1103void addVV(DeviceArray<cudaReal>& a, DeviceArray<cudaReal> const & b,
1104 DeviceArray<cudaReal> const & c, const int beginIdA,
1105 const int beginIdB, const int beginIdC, const int n)
1106{
1107 UTIL_CHECK(a.capacity() >= n + beginIdA);
1108 UTIL_CHECK(b.capacity() >= n + beginIdB);
1109 UTIL_CHECK(c.capacity() >= n + beginIdC);
1110
1111 // GPU resources
1112 int nBlocks, nThreads;
1113 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1114
1115 // Launch kernel
1116 _addVV<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b.cArray()+beginIdB,
1117 c.cArray()+beginIdC, n);
1118 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1119}
1120
1121// Vector addition, a[i] = b[i] + c[i], kernel wrapper (cudaComplex).
1122void addVV(DeviceArray<cudaComplex>& a, DeviceArray<cudaComplex> const & b,
1123 DeviceArray<cudaComplex> const & c, const int beginIdA,
1124 const int beginIdB, const int beginIdC, const int n)
1125{
1126 UTIL_CHECK(a.capacity() >= n + beginIdA);
1127 UTIL_CHECK(b.capacity() >= n + beginIdB);
1128 UTIL_CHECK(c.capacity() >= n + beginIdC);
1129
1130 // GPU resources
1131 int nBlocks, nThreads;
1132 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1133
1134 // Launch kernel
1135 _addVV<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b.cArray()+beginIdB,
1136 c.cArray()+beginIdC, n);
1137 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1138}
1139
1140// Vector addition, a[i] = b[i] + c[i], kernel wrapper (mixed, b = real).
1141void addVV(DeviceArray<cudaComplex>& a, DeviceArray<cudaReal> const & b,
1142 DeviceArray<cudaComplex> const & c, const int beginIdA,
1143 const int beginIdB, const int beginIdC, const int n)
1144{
1145 UTIL_CHECK(a.capacity() >= n + beginIdA);
1146 UTIL_CHECK(b.capacity() >= n + beginIdB);
1147 UTIL_CHECK(c.capacity() >= n + beginIdC);
1148
1149 // GPU resources
1150 int nBlocks, nThreads;
1151 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1152
1153 // Launch kernel
1154 _addVV<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b.cArray()+beginIdB,
1155 c.cArray()+beginIdC, n);
1156 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1157}
1158
1159// Vector addition, a[i] = b[i] + c[i], kernel wrapper (mixed, c = real).
1160void addVV(DeviceArray<cudaComplex>& a, DeviceArray<cudaComplex> const & b,
1161 DeviceArray<cudaReal> const & c, const int beginIdA,
1162 const int beginIdB, const int beginIdC, const int n)
1163{
1164 UTIL_CHECK(a.capacity() >= n + beginIdA);
1165 UTIL_CHECK(b.capacity() >= n + beginIdB);
1166 UTIL_CHECK(c.capacity() >= n + beginIdC);
1167
1168 // GPU resources
1169 int nBlocks, nThreads;
1170 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1171
1172 // Launch kernel
1173 _addVV<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b.cArray()+beginIdB,
1174 c.cArray()+beginIdC, n);
1175 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1176}
1177
1178// Vector addition, a[i] = b[i] + c, kernel wrapper (cudaReal).
1179void addVS(DeviceArray<cudaReal>& a, DeviceArray<cudaReal> const & b,
1180 const cudaReal c, const int beginIdA, const int beginIdB, const int n)
1181{
1182 UTIL_CHECK(a.capacity() >= n + beginIdA);
1183 UTIL_CHECK(b.capacity() >= n + beginIdB);
1184
1185 // GPU resources
1186 int nBlocks, nThreads;
1187 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1188
1189 // Launch kernel
1190 _addVS<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b.cArray()+beginIdB,
1191 c, n);
1192 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1193}
1194
1195// Vector addition, a[i] = b[i] + c, kernel wrapper (cudaComplex).
1196void addVS(DeviceArray<cudaComplex>& a, DeviceArray<cudaComplex> const & b,
1197 const cudaComplex c, const int beginIdA, const int beginIdB, const int n)
1198{
1199 UTIL_CHECK(a.capacity() >= n + beginIdA);
1200 UTIL_CHECK(b.capacity() >= n + beginIdB);
1201
1202 // GPU resources
1203 int nBlocks, nThreads;
1204 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1205
1206 // Launch kernel
1207 _addVS<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b.cArray()+beginIdB,
1208 c, n);
1209 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1210}
1211
1212// Vector addition, a[i] = b[i] + c, kernel wrapper (mixed, b = real).
1213void addVS(DeviceArray<cudaComplex>& a, DeviceArray<cudaReal> const & b,
1214 const cudaComplex c, const int beginIdA, const int beginIdB, const int n)
1215{
1216 UTIL_CHECK(a.capacity() >= n + beginIdA);
1217 UTIL_CHECK(b.capacity() >= n + beginIdB);
1218
1219 // GPU resources
1220 int nBlocks, nThreads;
1221 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1222
1223 // Launch kernel
1224 _addVS<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b.cArray()+beginIdB,
1225 c, n);
1226 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1227}
1228
1229// Vector addition, a[i] = b[i] + c, kernel wrapper (mixed, c = real).
1230void addVS(DeviceArray<cudaComplex>& a, DeviceArray<cudaComplex> const & b,
1231 const cudaReal c, const int beginIdA, const int beginIdB, const int n)
1232{
1233 UTIL_CHECK(a.capacity() >= n + beginIdA);
1234 UTIL_CHECK(b.capacity() >= n + beginIdB);
1235
1236 // GPU resources
1237 int nBlocks, nThreads;
1238 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1239
1240 // Launch kernel
1241 _addVS<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b.cArray()+beginIdB,
1242 c, n);
1243 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1244}
1245
1246// Vector subtraction, a[i] = b[i] - c[i], kernel wrapper (cudaReal).
1247void subVV(DeviceArray<cudaReal>& a, DeviceArray<cudaReal> const & b,
1248 DeviceArray<cudaReal> const & c, const int beginIdA,
1249 const int beginIdB, const int beginIdC, const int n)
1250{
1251 UTIL_CHECK(a.capacity() >= n + beginIdA);
1252 UTIL_CHECK(b.capacity() >= n + beginIdB);
1253 UTIL_CHECK(c.capacity() >= n + beginIdC);
1254
1255 // GPU resources
1256 int nBlocks, nThreads;
1257 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1258
1259 // Launch kernel
1260 _subVV<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b.cArray()+beginIdB,
1261 c.cArray()+beginIdC, n);
1262 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1263}
1264
1265// Vector subtraction, a[i] = b[i] - c[i], kernel wrapper (cudaComplex).
1266void subVV(DeviceArray<cudaComplex>& a, DeviceArray<cudaComplex> const & b,
1267 DeviceArray<cudaComplex> const & c, const int beginIdA,
1268 const int beginIdB, const int beginIdC, const int n)
1269{
1270 UTIL_CHECK(a.capacity() >= n + beginIdA);
1271 UTIL_CHECK(b.capacity() >= n + beginIdB);
1272 UTIL_CHECK(c.capacity() >= n + beginIdC);
1273
1274 // GPU resources
1275 int nBlocks, nThreads;
1276 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1277
1278 // Launch kernel
1279 _subVV<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b.cArray()+beginIdB,
1280 c.cArray()+beginIdC, n);
1281 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1282}
1283
1284// Vector subtraction, a[i]=b[i]-c[i], kernel wrapper (mixed, b=real).
1285void subVV(DeviceArray<cudaComplex>& a, DeviceArray<cudaReal> const & b,
1286 DeviceArray<cudaComplex> const & c, const int beginIdA,
1287 const int beginIdB, const int beginIdC, const int n)
1288{
1289 UTIL_CHECK(a.capacity() >= n + beginIdA);
1290 UTIL_CHECK(b.capacity() >= n + beginIdB);
1291 UTIL_CHECK(c.capacity() >= n + beginIdC);
1292
1293 // GPU resources
1294 int nBlocks, nThreads;
1295 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1296
1297 // Launch kernel
1298 _subVV<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b.cArray()+beginIdB,
1299 c.cArray()+beginIdC, n);
1300 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1301}
1302
1303// Vector subtraction, a[i]=b[i]-c[i], kernel wrapper (mixed, c=real).
1304void subVV(DeviceArray<cudaComplex>& a, DeviceArray<cudaComplex> const & b,
1305 DeviceArray<cudaReal> const & c, const int beginIdA,
1306 const int beginIdB, const int beginIdC, const int n)
1307{
1308 UTIL_CHECK(a.capacity() >= n + beginIdA);
1309 UTIL_CHECK(b.capacity() >= n + beginIdB);
1310 UTIL_CHECK(c.capacity() >= n + beginIdC);
1311
1312 // GPU resources
1313 int nBlocks, nThreads;
1314 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1315
1316 // Launch kernel
1317 _subVV<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b.cArray()+beginIdB,
1318 c.cArray()+beginIdC, n);
1319 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1320}
1321
1322// Vector subtraction, a[i] = b[i] - c, kernel wrapper (cudaReal).
1323void subVS(DeviceArray<cudaReal>& a,
1324 DeviceArray<cudaReal> const & b, const cudaReal c,
1325 const int beginIdA, const int beginIdB, const int n)
1326{
1327 UTIL_CHECK(a.capacity() >= n + beginIdA);
1328 UTIL_CHECK(b.capacity() >= n + beginIdB);
1329
1330 // GPU resources
1331 int nBlocks, nThreads;
1332 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1333
1334 // Launch kernel
1335 _subVS<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b.cArray()+beginIdB,
1336 c, n);
1337 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1338}
1339
1340// Vector subtraction, a[i] = b[i] - c, kernel wrapper (cudaComplex).
1341void subVS(DeviceArray<cudaComplex>& a, DeviceArray<cudaComplex> const & b,
1342 const cudaComplex c, const int beginIdA, const int beginIdB,
1343 const int n)
1344{
1345 UTIL_CHECK(a.capacity() >= n + beginIdA);
1346 UTIL_CHECK(b.capacity() >= n + beginIdB);
1347
1348 // GPU resources
1349 int nBlocks, nThreads;
1350 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1351
1352 // Launch kernel
1353 _subVS<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b.cArray()+beginIdB,
1354 c, n);
1355 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1356}
1357
1358// Vector subtraction, a[i] = b[i] - c, kernel wrapper (mixed, b = real).
1359void subVS(DeviceArray<cudaComplex>& a, DeviceArray<cudaReal> const & b,
1360 const cudaComplex c, const int beginIdA, const int beginIdB,
1361 const int n)
1362{
1363 UTIL_CHECK(a.capacity() >= n + beginIdA);
1364 UTIL_CHECK(b.capacity() >= n + beginIdB);
1365
1366 // GPU resources
1367 int nBlocks, nThreads;
1368 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1369
1370 // Launch kernel
1371 _subVS<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b.cArray()+beginIdB,
1372 c, n);
1373 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1374}
1375
1376// Vector subtraction, a[i] = b[i] - c, kernel wrapper (mixed, c = real).
1377void subVS(DeviceArray<cudaComplex>& a, DeviceArray<cudaComplex> const & b,
1378 const cudaReal c, const int beginIdA, const int beginIdB,
1379 const int n)
1380{
1381 UTIL_CHECK(a.capacity() >= n + beginIdA);
1382 UTIL_CHECK(b.capacity() >= n + beginIdB);
1383
1384 // GPU resources
1385 int nBlocks, nThreads;
1386 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1387
1388 // Launch kernel
1389 _subVS<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b.cArray()+beginIdB,
1390 c, n);
1391 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1392}
1393
1394// Vector multiplication, a[i] = b[i] * c[i], kernel wrapper (cudaReal).
1395void mulVV(DeviceArray<cudaReal>& a, DeviceArray<cudaReal> const & b,
1396 DeviceArray<cudaReal> const & c, const int beginIdA,
1397 const int beginIdB, const int beginIdC, const int n)
1398{
1399 UTIL_CHECK(a.capacity() >= n + beginIdA);
1400 UTIL_CHECK(b.capacity() >= n + beginIdB);
1401 UTIL_CHECK(c.capacity() >= n + beginIdC);
1402
1403 // GPU resources
1404 int nBlocks, nThreads;
1405 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1406
1407 // Launch kernel
1408 _mulVV<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b.cArray()+beginIdB,
1409 c.cArray()+beginIdC, n);
1410 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1411}
1412
1413// Vector multiplication, a[i] = b[i] * c[i], kernel wrapper (cudaComplex).
1414void mulVV(DeviceArray<cudaComplex>& a, DeviceArray<cudaComplex> const & b,
1415 DeviceArray<cudaComplex> const & c, const int beginIdA,
1416 const int beginIdB, const int beginIdC, const int n)
1417{
1418 UTIL_CHECK(a.capacity() >= n + beginIdA);
1419 UTIL_CHECK(b.capacity() >= n + beginIdB);
1420 UTIL_CHECK(c.capacity() >= n + beginIdC);
1421
1422 // GPU resources
1423 int nBlocks, nThreads;
1424 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1425
1426 // Launch kernel
1427 _mulVV<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b.cArray()+beginIdB,
1428 c.cArray()+beginIdC, n);
1429 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1430}
1431
1432// Vector multiplication, a[i]=b[i]*c[i], kernel wrapper (mixed, b = real).
1433void mulVV(DeviceArray<cudaComplex>& a, DeviceArray<cudaReal> const & b,
1434 DeviceArray<cudaComplex> const & c, const int beginIdA,
1435 const int beginIdB, const int beginIdC, const int n)
1436{
1437 UTIL_CHECK(a.capacity() >= n + beginIdA);
1438 UTIL_CHECK(b.capacity() >= n + beginIdB);
1439 UTIL_CHECK(c.capacity() >= n + beginIdC);
1440
1441 // GPU resources
1442 int nBlocks, nThreads;
1443 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1444
1445 // Launch kernel
1446 _mulVV<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b.cArray()+beginIdB,
1447 c.cArray()+beginIdC, n);
1448 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1449}
1450
1451// Vector multiplication, a[i]=b[i]*c[i], kernel wrapper (mixed, c = real).
1452void mulVV(DeviceArray<cudaComplex>& a, DeviceArray<cudaComplex> const & b,
1453 DeviceArray<cudaReal> const & c, const int beginIdA,
1454 const int beginIdB, const int beginIdC, const int n)
1455{
1456 UTIL_CHECK(a.capacity() >= n + beginIdA);
1457 UTIL_CHECK(b.capacity() >= n + beginIdB);
1458 UTIL_CHECK(c.capacity() >= n + beginIdC);
1459
1460 // GPU resources
1461 int nBlocks, nThreads;
1462 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1463
1464 // Launch kernel
1465 _mulVV<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b.cArray()+beginIdB,
1466 c.cArray()+beginIdC, n);
1467 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1468}
1469
1470// Vector multiplication, a[i] = b[i] * c, kernel wrapper (cudaReal).
1471void mulVS(DeviceArray<cudaReal>& a, DeviceArray<cudaReal> const & b,
1472 const cudaReal c, const int beginIdA, const int beginIdB,
1473 const int n)
1474{
1475 UTIL_CHECK(a.capacity() >= n + beginIdA);
1476 UTIL_CHECK(b.capacity() >= n + beginIdB);
1477
1478 // GPU resources
1479 int nBlocks, nThreads;
1480 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1481
1482 // Launch kernel
1483 _mulVS<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b.cArray()+beginIdB,
1484 c, n);
1485 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1486}
1487
1488// Vector multiplication, a[i] = b[i] * c, kernel wrapper (cudaComplex).
1489void mulVS(DeviceArray<cudaComplex>& a, DeviceArray<cudaComplex> const & b,
1490 const cudaComplex c, const int beginIdA, const int beginIdB,
1491 const int n)
1492{
1493 UTIL_CHECK(a.capacity() >= n + beginIdA);
1494 UTIL_CHECK(b.capacity() >= n + beginIdB);
1495
1496 // GPU resources
1497 int nBlocks, nThreads;
1498 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1499
1500 // Launch kernel
1501 _mulVS<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b.cArray()+beginIdB,
1502 c, n);
1503 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1504}
1505
1506// Vector multiplication, a[i] = b[i] * c, kernel wrapper (mixed, b = real).
1507void mulVS(DeviceArray<cudaComplex>& a,
1508 DeviceArray<cudaReal> const & b,
1509 const cudaComplex c,
1510 const int beginIdA, const int beginIdB, const int n)
1511{
1512 UTIL_CHECK(a.capacity() >= n + beginIdA);
1513 UTIL_CHECK(b.capacity() >= n + beginIdB);
1514
1515 // GPU resources
1516 int nBlocks, nThreads;
1517 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1518
1519 // Launch kernel
1520 _mulVS<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b.cArray()+beginIdB,
1521 c, n);
1522 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1523}
1524
1525// Vector multiplication, a[i] = b[i] * c, kernel wrapper (mixed, c = real).
1526void mulVS(DeviceArray<cudaComplex>& a, DeviceArray<cudaComplex> const & b,
1527 const cudaReal c, const int beginIdA, const int beginIdB,
1528 const int n)
1529{
1530 UTIL_CHECK(a.capacity() >= n + beginIdA);
1531 UTIL_CHECK(b.capacity() >= n + beginIdB);
1532
1533 // GPU resources
1534 int nBlocks, nThreads;
1535 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1536
1537 // Launch kernel
1538 _mulVS<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b.cArray()+beginIdB,
1539 c, n);
1540 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1541}
1542
1543// Vector division, a[i] = b[i] / c[i], kernel wrapper (cudaReal).
1544void divVV(DeviceArray<cudaReal>& a, DeviceArray<cudaReal> const & b,
1545 DeviceArray<cudaReal> const & c, const int beginIdA,
1546 const int beginIdB, const int beginIdC, const int n)
1547{
1548 UTIL_CHECK(a.capacity() >= n + beginIdA);
1549 UTIL_CHECK(b.capacity() >= n + beginIdB);
1550 UTIL_CHECK(c.capacity() >= n + beginIdC);
1551
1552 // GPU resources
1553 int nBlocks, nThreads;
1554 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1555
1556 // Launch kernel
1557 _divVV<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b.cArray()+beginIdB,
1558 c.cArray()+beginIdC, n);
1559 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1560}
1561
1562// Vector division, a[i] = b[i] / c[i], kernel wrapper (mixed, c = real).
1563void divVV(DeviceArray<cudaComplex>& a, DeviceArray<cudaComplex> const & b,
1564 DeviceArray<cudaReal> const & c, const int beginIdA,
1565 const int beginIdB, const int beginIdC, const int n)
1566{
1567 UTIL_CHECK(a.capacity() >= n + beginIdA);
1568 UTIL_CHECK(b.capacity() >= n + beginIdB);
1569 UTIL_CHECK(c.capacity() >= n + beginIdC);
1570
1571 // GPU resources
1572 int nBlocks, nThreads;
1573 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1574
1575 // Launch kernel
1576 _divVV<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b.cArray()+beginIdB,
1577 c.cArray()+beginIdC, n);
1578 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1579}
1580
1581// Vector division, a[i] = b[i] / c, kernel wrapper (cudaReal).
1582void divVS(DeviceArray<cudaReal>& a, DeviceArray<cudaReal> const & b,
1583 const cudaReal c, const int beginIdA, const int beginIdB,
1584 const int n)
1585{
1586 UTIL_CHECK(a.capacity() >= n + beginIdA);
1587 UTIL_CHECK(b.capacity() >= n + beginIdB);
1588
1589 // GPU resources
1590 int nBlocks, nThreads;
1591 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1592
1593 // Launch kernel
1594 _divVS<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b.cArray()+beginIdB,
1595 c, n);
1596 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1597}
1598
1599// Vector division, a[i] = b[i] / c, kernel wrapper (mixed, c = real).
1600void divVS(DeviceArray<cudaComplex>& a,
1601 DeviceArray<cudaComplex> const & b,
1602 const cudaReal c, const int beginIdA, const int beginIdB,
1603 const int n)
1604{
1605 UTIL_CHECK(a.capacity() >= n + beginIdA);
1606 UTIL_CHECK(b.capacity() >= n + beginIdB);
1607
1608 // GPU resources
1609 int nBlocks, nThreads;
1610 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1611
1612 // Launch kernel
1613 _divVS<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b.cArray()+beginIdB,
1614 c, n);
1615 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1616}
1617
1618// Division of scalar by vector, a[i] = b / c[i], kernel wrapper (cudaReal).
1619void divSV(DeviceArray<cudaReal>& a,
1620 const cudaReal b,
1621 DeviceArray<cudaReal> const & c,
1622 const int beginIdA, const int beginIdC, const int n)
1623{
1624 UTIL_CHECK(a.capacity() >= n + beginIdA);
1625 UTIL_CHECK(c.capacity() >= n + beginIdC);
1626
1627 // GPU resources
1628 int nBlocks, nThreads;
1629 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1630
1631 // Launch kernel
1632 _divSV<<<nBlocks, nThreads>>>(a.cArray() + beginIdA, b,
1633 c.cArray() + beginIdC, n);
1634 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1635}
1636
1637// Vector exponentiation, a[i] = exp(b[i]), kernel wrapper (cudaReal).
1638void expV(DeviceArray<cudaReal>& a, DeviceArray<cudaReal> const & b,
1639 const int beginIdA, const int beginIdB, const int n)
1640{
1641 UTIL_CHECK(a.capacity() >= n + beginIdA);
1642 UTIL_CHECK(b.capacity() >= n + beginIdB);
1643
1644 // GPU resources
1645 int nBlocks, nThreads;
1646 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1647
1648 // Launch kernel
1649 _expV<<<nBlocks, nThreads>>>(a.cArray()+beginIdA,
1650 b.cArray()+beginIdB, n);
1651 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1652}
1653
1654// Vector exponentiation, a[i] = exp(b[i]), kernel wrapper (cudaComplex).
1655void expV(DeviceArray<cudaComplex>& a, DeviceArray<cudaComplex> const & b,
1656 const int beginIdA, const int beginIdB, const int n)
1657{
1658 UTIL_CHECK(a.capacity() >= n + beginIdA);
1659 UTIL_CHECK(b.capacity() >= n + beginIdB);
1660
1661 // GPU resources
1662 int nBlocks, nThreads;
1663 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1664
1665 // Launch kernel
1666 _expV<<<nBlocks, nThreads>>>(a.cArray()+beginIdA,
1667 b.cArray()+beginIdB, n);
1668 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1669}
1670
1671// Vector addition in-place, a[i] += b[i], kernel wrapper (cudaReal).
1672void addEqV(DeviceArray<cudaReal>& a, DeviceArray<cudaReal> const & b,
1673 const int beginIdA, const int beginIdB, const int n)
1674{
1675 UTIL_CHECK(a.capacity() >= n + beginIdA);
1676 UTIL_CHECK(b.capacity() >= n + beginIdB);
1677
1678 // GPU resources
1679 int nBlocks, nThreads;
1680 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1681
1682 // Launch kernel
1683 _addEqV<<<nBlocks, nThreads>>>(a.cArray()+beginIdA,
1684 b.cArray()+beginIdB, n);
1685 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1686}
1687
1688// Vector addition in-place, a[i] += b[i], kernel wrapper (cudaComplex).
1689void addEqV(DeviceArray<cudaComplex>& a,
1690 DeviceArray<cudaComplex> const & b,
1691 const int beginIdA, const int beginIdB, const int n)
1692{
1693 UTIL_CHECK(a.capacity() >= n + beginIdA);
1694 UTIL_CHECK(b.capacity() >= n + beginIdB);
1695
1696 // GPU resources
1697 int nBlocks, nThreads;
1698 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1699
1700 // Launch kernel
1701 _addEqV<<<nBlocks, nThreads>>>(a.cArray()+beginIdA,
1702 b.cArray()+beginIdB, n);
1703 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1704}
1705
1706// Vector addition in-place, a[i] += b[i], kernel wrapper (mixed).
1707void addEqV(DeviceArray<cudaComplex>& a, DeviceArray<cudaReal> const & b,
1708 const int beginIdA, const int beginIdB, const int n)
1709{
1710 UTIL_CHECK(a.capacity() >= n + beginIdA);
1711 UTIL_CHECK(b.capacity() >= n + beginIdB);
1712
1713 // GPU resources
1714 int nBlocks, nThreads;
1715 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1716
1717 // Launch kernel
1718 _addEqV<<<nBlocks, nThreads>>>(a.cArray()+beginIdA,
1719 b.cArray()+beginIdB, n);
1720 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1721}
1722
1723// Vector addition in-place, a[i] += b, kernel wrapper (cudaReal).
1724void addEqS(DeviceArray<cudaReal>& a, const cudaReal b,
1725 const int beginIdA, const int n)
1726{
1727 UTIL_CHECK(a.capacity() >= n + beginIdA);
1728
1729 // GPU resources
1730 int nBlocks, nThreads;
1731 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1732
1733 // Launch kernel
1734 _addEqS<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b, n);
1735 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1736}
1737
1738// Vector addition in-place, a[i] += b, kernel wrapper (cudaComplex).
1739void addEqS(DeviceArray<cudaComplex>& a, const cudaComplex b,
1740 const int beginIdA, const int n)
1741{
1742 UTIL_CHECK(a.capacity() >= n + beginIdA);
1743
1744 // GPU resources
1745 int nBlocks, nThreads;
1746 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1747
1748 // Launch kernel
1749 _addEqS<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b, n);
1750 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1751}
1752
1753// Vector addition in-place, a[i] += b, kernel wrapper (mixed).
1754void addEqS(DeviceArray<cudaComplex>& a, const cudaReal b,
1755 const int beginIdA, const int n)
1756{
1757 UTIL_CHECK(a.capacity() >= n + beginIdA);
1758
1759 // GPU resources
1760 int nBlocks, nThreads;
1761 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1762
1763 // Launch kernel
1764 _addEqS<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b, n);
1765 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1766}
1767
1768// Vector subtraction in-place, a[i] -= b[i], kernel wrapper (cudaReal).
1769void subEqV(DeviceArray<cudaReal>& a,
1770 DeviceArray<cudaReal> const & b,
1771 const int beginIdA, const int beginIdB, const int n)
1772{
1773 UTIL_CHECK(a.capacity() >= n + beginIdA);
1774 UTIL_CHECK(b.capacity() >= n + beginIdB);
1775
1776 // GPU resources
1777 int nBlocks, nThreads;
1778 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1779
1780 // Launch kernel
1781 _subEqV<<<nBlocks, nThreads>>>(a.cArray()+beginIdA,
1782 b.cArray()+beginIdB, n);
1783 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1784}
1785
1786// Vector subtraction in-place, a[i] -= b[i], kernel wrapper (cudaComplex).
1787void subEqV(DeviceArray<cudaComplex>& a,
1788 DeviceArray<cudaComplex> const & b,
1789 const int beginIdA, const int beginIdB, const int n)
1790{
1791 UTIL_CHECK(a.capacity() >= n + beginIdA);
1792 UTIL_CHECK(b.capacity() >= n + beginIdB);
1793
1794 // GPU resources
1795 int nBlocks, nThreads;
1796 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1797
1798 // Launch kernel
1799 _subEqV<<<nBlocks, nThreads>>>(a.cArray()+beginIdA,
1800 b.cArray()+beginIdB, n);
1801 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1802}
1803
1804// Vector subtraction in-place, a[i] -= b[i], kernel wrapper (mixed).
1805void subEqV(DeviceArray<cudaComplex>& a, DeviceArray<cudaReal> const & b,
1806 const int beginIdA, const int beginIdB, const int n)
1807{
1808 UTIL_CHECK(a.capacity() >= n + beginIdA);
1809 UTIL_CHECK(b.capacity() >= n + beginIdB);
1810
1811 // GPU resources
1812 int nBlocks, nThreads;
1813 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1814
1815 // Launch kernel
1816 _subEqV<<<nBlocks, nThreads>>>(a.cArray()+beginIdA,
1817 b.cArray()+beginIdB, n);
1818 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1819}
1820
1821// Vector subtraction in-place, a[i] -= b, kernel wrapper (cudaReal).
1822void subEqS(DeviceArray<cudaReal>& a, const cudaReal b,
1823 const int beginIdA, const int n)
1824{
1825 UTIL_CHECK(a.capacity() >= n + beginIdA);
1826
1827 // GPU resources
1828 int nBlocks, nThreads;
1829 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1830
1831 // Launch kernel
1832 _subEqS<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b, n);
1833 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1834}
1835
1836// Vector subtraction in-place, a[i] -= b, kernel wrapper (cudaComplex).
1837void subEqS(DeviceArray<cudaComplex>& a, const cudaComplex b,
1838 const int beginIdA, const int n)
1839{
1840 UTIL_CHECK(a.capacity() >= n + beginIdA);
1841
1842 // GPU resources
1843 int nBlocks, nThreads;
1844 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1845
1846 // Launch kernel
1847 _subEqS<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b, n);
1848 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1849}
1850
1851// Vector subtraction in-place, a[i] -= b, kernel wrapper (mixed).
1852void subEqS(DeviceArray<cudaComplex>& a, const cudaReal b,
1853 const int beginIdA, const int n)
1854{
1855 UTIL_CHECK(a.capacity() >= n + beginIdA);
1856
1857 // GPU resources
1858 int nBlocks, nThreads;
1859 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1860
1861 // Launch kernel
1862 _subEqS<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b, n);
1863 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1864}
1865
1866// Vector multiplication in-place, a[i] *= b[i], kernel wrapper (cudaReal).
1867void mulEqV(DeviceArray<cudaReal>& a, DeviceArray<cudaReal> const & b,
1868 const int beginIdA, const int beginIdB, const int n)
1869{
1870 UTIL_CHECK(a.capacity() >= n + beginIdA);
1871 UTIL_CHECK(b.capacity() >= n + beginIdB);
1872
1873 // GPU resources
1874 int nBlocks, nThreads;
1875 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1876
1877 // Launch kernel
1878 _mulEqV<<<nBlocks, nThreads>>>(a.cArray()+beginIdA,
1879 b.cArray()+beginIdB, n);
1880 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1881}
1882
1883// Vector multiplication in-place, a[i] *= b[i], kernel wrapper (cudaComplex).
1884void mulEqV(DeviceArray<cudaComplex>& a, DeviceArray<cudaComplex> const & b,
1885 const int beginIdA, const int beginIdB, const int n)
1886{
1887 UTIL_CHECK(a.capacity() >= n + beginIdA);
1888 UTIL_CHECK(b.capacity() >= n + beginIdB);
1889
1890 // GPU resources
1891 int nBlocks, nThreads;
1892 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1893
1894 // Launch kernel
1895 _mulEqV<<<nBlocks, nThreads>>>(a.cArray()+beginIdA,
1896 b.cArray()+beginIdB, n);
1897 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1898}
1899
1900// Vector multiplication in-place, a[i] *= b[i], kernel wrapper (mixed).
1901void mulEqV(DeviceArray<cudaComplex>& a, DeviceArray<cudaReal> const & b,
1902 const int beginIdA, const int beginIdB, const int n)
1903{
1904 UTIL_CHECK(a.capacity() >= n + beginIdA);
1905 UTIL_CHECK(b.capacity() >= n + beginIdB);
1906
1907 // GPU resources
1908 int nBlocks, nThreads;
1909 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1910
1911 // Launch kernel
1912 _mulEqV<<<nBlocks, nThreads>>>(a.cArray()+beginIdA,
1913 b.cArray()+beginIdB, n);
1914 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1915}
1916
1917// Vector multiplication in-place, a[i] *= b, kernel wrapper (cudaReal).
1918void mulEqS(DeviceArray<cudaReal>& a, const cudaReal b,
1919 const int beginIdA, const int n)
1920{
1921 UTIL_CHECK(a.capacity() >= n + beginIdA);
1922
1923 // GPU resources
1924 int nBlocks, nThreads;
1925 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1926
1927 // Launch kernel
1928 _mulEqS<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b, n);
1929 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1930}
1931
1932// Vector multiplication in-place, a[i] *= b, kernel wrapper (cudaComplex).
1933void mulEqS(DeviceArray<cudaComplex>& a,
1934 const cudaComplex b,
1935 const int beginIdA, const int n)
1936{
1937 UTIL_CHECK(a.capacity() >= n + beginIdA);
1938
1939 // GPU resources
1940 int nBlocks, nThreads;
1941 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1942
1943 // Launch kernel
1944 _mulEqS<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b, n);
1945 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1946}
1947
1948// Vector multiplication in-place, a[i] *= b, kernel wrapper (mixed).
1949void mulEqS(DeviceArray<cudaComplex>& a, const cudaReal b,
1950 const int beginIdA, const int n)
1951{
1952 UTIL_CHECK(a.capacity() >= n + beginIdA);
1953
1954 // GPU resources
1955 int nBlocks, nThreads;
1956 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1957
1958 // Launch kernel
1959 _mulEqS<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b, n);
1960 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1961}
1962
1963// Vector division in-place, a[i] /= b[i], kernel wrapper (cudaReal).
1964void divEqV(DeviceArray<cudaReal>& a, DeviceArray<cudaReal> const & b,
1965 const int beginIdA, const int beginIdB, const int n)
1966{
1967 UTIL_CHECK(a.capacity() >= n + beginIdA);
1968 UTIL_CHECK(b.capacity() >= n + beginIdB);
1969
1970 // GPU resources
1971 int nBlocks, nThreads;
1972 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1973
1974 // Launch kernel
1975 _divEqV<<<nBlocks, nThreads>>>(a.cArray()+beginIdA,
1976 b.cArray()+beginIdB, n);
1977 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1978}
1979
1980// Vector division in-place, a[i] /= b[i], kernel wrapper (mixed).
1981void divEqV(DeviceArray<cudaComplex>& a, DeviceArray<cudaReal> const & b,
1982 const int beginIdA, const int beginIdB, const int n)
1983{
1984 UTIL_CHECK(a.capacity() >= n + beginIdA);
1985 UTIL_CHECK(b.capacity() >= n + beginIdB);
1986
1987 // GPU resources
1988 int nBlocks, nThreads;
1989 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1990
1991 // Launch kernel
1992 _divEqV<<<nBlocks, nThreads>>>(a.cArray()+beginIdA,
1993 b.cArray()+beginIdB, n);
1994 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1995}
1996
1997// Vector division in-place, a[i] /= b, kernel wrapper (cudaReal).
1998void divEqS(DeviceArray<cudaReal>& a, const cudaReal b,
1999 const int beginIdA, const int n)
2000{
2001 UTIL_CHECK(a.capacity() >= n + beginIdA);
2002
2003 // GPU resources
2004 int nBlocks, nThreads;
2005 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
2006
2007 // Launch kernel
2008 _divEqS<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b, n);
2009 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
2010}
2011
2012// Vector division in-place, a[i] /= b, kernel wrapper (mixed).
2013void divEqS(DeviceArray<cudaComplex>& a, const cudaReal b,
2014 const int beginIdA, const int n)
2015{
2016 UTIL_CHECK(a.capacity() >= n + beginIdA);
2017
2018 // GPU resources
2019 int nBlocks, nThreads;
2020 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
2021
2022 // Launch kernel
2023 _divEqS<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b, n);
2024 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
2025}
2026
2027} // namespace VecOp
2028} // namespace Cuda
2029} // namespace Prdc
2030} // namespace Pscf