PSCF v1.2
VecOp.cu
1/*
2* PSCF Package
3*
4* Copyright 2016 - 2022, The Regents of the University of Minnesota
5* Distributed under the terms of the GNU General Public License.
6*/
7
8#include "VecOp.h"
9#include <pscf/cuda/ThreadArray.h>
10#include <pscf/cuda/cudaErrorCheck.h>
11#include <cmath>
12
13namespace Pscf {
14namespace Prdc {
15namespace Cuda {
16namespace VecOp {
17
18// CUDA kernels:
19// (defined in anonymous namespace, used within this file only)
20
21namespace {
22
23 /*
24 * Vector assignment, a[i] = b[i], GPU kernel (cudaReal).
25 *
26 * \param a output array (LHS)
27 * \param b input array (RHS)
28 * \param n size of arrays
29 */
30 __global__ void _eqV(cudaReal* a, cudaReal const * b, const int n)
31 {
32 int nThreads = blockDim.x * gridDim.x;
33 int startID = blockIdx.x * blockDim.x + threadIdx.x;
34 for(int i = startID; i < n; i += nThreads) {
35 a[i] = b[i];
36 }
37 }
38
39 /*
40 * Vector assignment, a[i] = b[i], GPU kernel (cudaComplex).
41 *
42 * \param a output array (LHS)
43 * \param b input array (RHS)
44 * \param n size of arrays
45 */
46 __global__ void _eqV(cudaComplex* a, cudaComplex const * b, const int n)
47 {
48 int nThreads = blockDim.x * gridDim.x;
49 int startID = blockIdx.x * blockDim.x + threadIdx.x;
50 for(int i = startID; i < n; i += nThreads) {
51 a[i].x = b[i].x;
52 a[i].y = b[i].y;
53 }
54 }
55
56 /*
57 * Vector assignment, a[i] = b, GPU kernel (cudaReal).
58 *
59 * \param a output array (LHS)
60 * \param b input scalar (RHS)
61 * \param n size of arrays
62 */
63 __global__ void _eqS(cudaReal* a, cudaReal const b, const int n)
64 {
65 int nThreads = blockDim.x * gridDim.x;
66 int startID = blockIdx.x * blockDim.x + threadIdx.x;
67 for(int i = startID; i < n; i += nThreads) {
68 a[i] = b;
69 }
70 }
71
72 /*
73 * Vector assignment, a[i] = b, GPU kernel (cudaComplex).
74 *
75 * \param a output array (LHS)
76 * \param b input scalar (RHS)
77 * \param n size of arrays
78 */
79 __global__ void _eqS(cudaComplex* a, cudaComplex const b, const int n)
80 {
81 int nThreads = blockDim.x * gridDim.x;
82 int startID = blockIdx.x * blockDim.x + threadIdx.x;
83 for(int i = startID; i < n; i += nThreads) {
84 a[i].x = b.x;
85 a[i].y = b.y;
86 }
87 }
88
89 /*
90 * Vector addition, a[i] = b[i] + c[i], GPU kernel (cudaReal).
91 *
92 * \param a output array (LHS)
93 * \param b input array (RHS)
94 * \param c input array (RHS)
95 * \param n size of arrays
96 */
97 __global__ void _addVV(cudaReal* a, cudaReal const * b,
98 cudaReal const * c, const int n)
99 {
100 int nThreads = blockDim.x * gridDim.x;
101 int startID = blockIdx.x * blockDim.x + threadIdx.x;
102 for(int i = startID; i < n; i += nThreads) {
103 a[i] = b[i] + c[i];
104 }
105 }
106
107 /*
108 * Vector addition, a[i] = b[i] + c[i], GPU kernel (cudaComplex).
109 *
110 * \param a output array (LHS)
111 * \param b input array (RHS)
112 * \param c input array (RHS)
113 * \param n size of arrays
114 */
115 __global__ void _addVV(cudaComplex* a, cudaComplex const * b,
116 cudaComplex const * c, const int n)
117 {
118 int nThreads = blockDim.x * gridDim.x;
119 int startID = blockIdx.x * blockDim.x + threadIdx.x;
120 for(int i = startID; i < n; i += nThreads) {
121 a[i].x = b[i].x + c[i].x;
122 a[i].y = b[i].y + c[i].y;
123 }
124 }
125
126 /*
127 * Vector addition, a[i] = b[i] + c[i], GPU kernel (mixed, b = real).
128 *
129 * \param a output array (LHS)
130 * \param b input array (RHS)
131 * \param c input array (RHS)
132 * \param n size of arrays
133 */
134 __global__ void _addVV(cudaComplex* a, cudaReal const * b,
135 cudaComplex const * c, const int n)
136 {
137 int nThreads = blockDim.x * gridDim.x;
138 int startID = blockIdx.x * blockDim.x + threadIdx.x;
139 for(int i = startID; i < n; i += nThreads) {
140 a[i].x = b[i] + c[i].x;
141 a[i].y = c[i].y;
142 }
143 }
144
145 /*
146 * Vector addition, a[i] = b[i] + c[i], GPU kernel (mixed, c = real).
147 *
148 * \param a output array (LHS)
149 * \param b input array (RHS)
150 * \param c input array (RHS)
151 * \param n size of arrays
152 */
153 __global__ void _addVV(cudaComplex* a, cudaComplex const * b,
154 cudaReal const * c, const int n)
155 {
156 int nThreads = blockDim.x * gridDim.x;
157 int startID = blockIdx.x * blockDim.x + threadIdx.x;
158 for(int i = startID; i < n; i += nThreads) {
159 a[i].x = b[i].x + c[i];
160 a[i].y = b[i].y;
161 }
162 }
163
164 /*
165 * Vector addition, a[i] = b[i] + c, GPU kernel (cudaReal).
166 *
167 * \param a output array (LHS)
168 * \param b input array (RHS)
169 * \param c input scalar (RHS)
170 * \param n size of arrays
171 */
172 __global__ void _addVS(cudaReal* a, cudaReal const * b,
173 cudaReal const c, const int n)
174 {
175 int nThreads = blockDim.x * gridDim.x;
176 int startID = blockIdx.x * blockDim.x + threadIdx.x;
177 for(int i = startID; i < n; i += nThreads) {
178 a[i] = b[i] + c;
179 }
180 }
181
182 /*
183 * Vector addition, a[i] = b[i] + c, GPU kernel (cudaComplex).
184 *
185 * \param a output array (LHS)
186 * \param b input array (RHS)
187 * \param c input scalar (RHS)
188 * \param n size of arrays
189 */
190 __global__ void _addVS(cudaComplex* a, cudaComplex const * b,
191 cudaComplex const c, const int n)
192 {
193 int nThreads = blockDim.x * gridDim.x;
194 int startID = blockIdx.x * blockDim.x + threadIdx.x;
195 for(int i = startID; i < n; i += nThreads) {
196 a[i].x = b[i].x + c.x;
197 a[i].y = b[i].y + c.y;
198 }
199 }
200
201 /*
202 * Vector addition, a[i] = b[i] + c, GPU kernel (mixed, b = real).
203 *
204 * \param a output array (LHS)
205 * \param b input array (RHS)
206 * \param c input scalar (RHS)
207 * \param n size of arrays
208 */
209 __global__ void _addVS(cudaComplex* a, cudaReal const * b,
210 cudaComplex const c, const int n)
211 {
212 int nThreads = blockDim.x * gridDim.x;
213 int startID = blockIdx.x * blockDim.x + threadIdx.x;
214 for(int i = startID; i < n; i += nThreads) {
215 a[i].x = b[i] + c.x;
216 a[i].y = c.y;
217 }
218 }
219
220 /*
221 * Vector addition, a[i] = b[i] + c, GPU kernel (mixed, c = real).
222 *
223 * \param a output array (LHS)
224 * \param b input array (RHS)
225 * \param c input scalar (RHS)
226 * \param n size of arrays
227 */
228 __global__ void _addVS(cudaComplex* a, cudaComplex const * b,
229 cudaReal const c, const int n)
230 {
231 int nThreads = blockDim.x * gridDim.x;
232 int startID = blockIdx.x * blockDim.x + threadIdx.x;
233 for(int i = startID; i < n; i += nThreads) {
234 a[i].x = b[i].x + c;
235 a[i].y = b[i].y;
236 }
237 }
238
239 /*
240 * Vector subtraction, a[i] = b[i] - c[i], GPU kernel (cudaReal).
241 *
242 * \param a output array (LHS)
243 * \param b input array (RHS)
244 * \param c input array (RHS)
245 * \param n size of arrays
246 */
247 __global__ void _subVV(cudaReal* a, cudaReal const * b,
248 cudaReal const * c, const int n)
249 {
250 int nThreads = blockDim.x * gridDim.x;
251 int startID = blockIdx.x * blockDim.x + threadIdx.x;
252 for(int i = startID; i < n; i += nThreads) {
253 a[i] = b[i] - c[i];
254 }
255 }
256
257 /*
258 * Vector subtraction, a[i] = b[i] - c[i], GPU kernel (cudaComplex).
259 *
260 * \param a output array (LHS)
261 * \param b input array (RHS)
262 * \param c input array (RHS)
263 * \param n size of arrays
264 */
265 __global__ void _subVV(cudaComplex* a, cudaComplex const * b,
266 cudaComplex const * c, const int n)
267 {
268 int nThreads = blockDim.x * gridDim.x;
269 int startID = blockIdx.x * blockDim.x + threadIdx.x;
270 for(int i = startID; i < n; i += nThreads) {
271 a[i].x = b[i].x - c[i].x;
272 a[i].y = b[i].y - c[i].y;
273 }
274 }
275
276 /*
277 * Vector subtraction, a[i] = b[i] - c[i], GPU kernel (mixed, b = real).
278 *
279 * \param a output array (LHS)
280 * \param b input array (RHS)
281 * \param c input array (RHS)
282 * \param n size of arrays
283 */
284 __global__ void _subVV(cudaComplex* a, cudaReal const * b,
285 cudaComplex const * c, const int n)
286 {
287 int nThreads = blockDim.x * gridDim.x;
288 int startID = blockIdx.x * blockDim.x + threadIdx.x;
289 for(int i = startID; i < n; i += nThreads) {
290 a[i].x = b[i] - c[i].x;
291 a[i].y = 0.0 - c[i].y;
292 }
293 }
294
295 /*
296 * Vector subtraction, a[i] = b[i] - c[i], GPU kernel (mixed, c = real).
297 *
298 * \param a output array (LHS)
299 * \param b input array (RHS)
300 * \param c input array (RHS)
301 * \param n size of arrays
302 */
303 __global__ void _subVV(cudaComplex* a, cudaComplex const * b,
304 cudaReal const * c, const int n)
305 {
306 int nThreads = blockDim.x * gridDim.x;
307 int startID = blockIdx.x * blockDim.x + threadIdx.x;
308 for(int i = startID; i < n; i += nThreads) {
309 a[i].x = b[i].x - c[i];
310 a[i].y = b[i].y;
311 }
312 }
313
314 /*
315 * Vector subtraction, a[i] = b[i] - c, GPU kernel (cudaReal).
316 *
317 * \param a output array (LHS)
318 * \param b input array (RHS)
319 * \param c input scalar (RHS)
320 * \param n size of arrays
321 */
322 __global__ void _subVS(cudaReal* a, cudaReal const * b,
323 cudaReal const c, const int n)
324 {
325 int nThreads = blockDim.x * gridDim.x;
326 int startID = blockIdx.x * blockDim.x + threadIdx.x;
327 for(int i = startID; i < n; i += nThreads) {
328 a[i] = b[i] - c;
329 }
330 }
331
332 /*
333 * Vector subtraction, a[i] = b[i] - c, GPU kernel (cudaComplex).
334 *
335 * \param a output array (LHS)
336 * \param b input array (RHS)
337 * \param c input scalar (RHS)
338 * \param n size of arrays
339 */
340 __global__ void _subVS(cudaComplex* a, cudaComplex const * b,
341 cudaComplex const c, const int n)
342 {
343 int nThreads = blockDim.x * gridDim.x;
344 int startID = blockIdx.x * blockDim.x + threadIdx.x;
345 for(int i = startID; i < n; i += nThreads) {
346 a[i].x = b[i].x - c.x;
347 a[i].y = b[i].y - c.y;
348 }
349 }
350
351 /*
352 * Vector subtraction, a[i] = b[i] - c, GPU kernel (mixed, b = real).
353 *
354 * \param a output array (LHS)
355 * \param b input array (RHS)
356 * \param c input scalar (RHS)
357 * \param n size of arrays
358 */
359 __global__ void _subVS(cudaComplex* a, cudaReal const * b,
360 cudaComplex const c, const int n)
361 {
362 int nThreads = blockDim.x * gridDim.x;
363 int startID = blockIdx.x * blockDim.x + threadIdx.x;
364 for(int i = startID; i < n; i += nThreads) {
365 a[i].x = b[i] - c.x;
366 a[i].y = 0.0 - c.y;
367 }
368 }
369
370 /*
371 * Vector subtraction, a[i] = b[i] - c, GPU kernel (mixed, c = real).
372 *
373 * \param a output array (LHS)
374 * \param b input array (RHS)
375 * \param c input scalar (RHS)
376 * \param n size of arrays
377 */
378 __global__ void _subVS(cudaComplex* a, cudaComplex const * b,
379 cudaReal const c, const int n)
380 {
381 int nThreads = blockDim.x * gridDim.x;
382 int startID = blockIdx.x * blockDim.x + threadIdx.x;
383 for(int i = startID; i < n; i += nThreads) {
384 a[i].x = b[i].x - c;
385 a[i].y = b[i].y;
386 }
387 }
388
389 /*
390 * Vector multiplication, a[i] = b[i] * c[i], GPU kernel (cudaReal).
391 *
392 * \param a output array (LHS)
393 * \param b input array (RHS)
394 * \param c input array (RHS)
395 * \param n size of arrays
396 */
397 __global__ void _mulVV(cudaReal* a, cudaReal const * b,
398 cudaReal const * c, const int n)
399 {
400 int nThreads = blockDim.x * gridDim.x;
401 int startID = blockIdx.x * blockDim.x + threadIdx.x;
402 for(int i = startID; i < n; i += nThreads) {
403 a[i] = b[i] * c[i];
404 }
405 }
406
407 /*
408 * Vector multiplication, a[i] = b[i] * c[i], GPU kernel (cudaComplex).
409 *
410 * \param a output array (LHS)
411 * \param b input array (RHS)
412 * \param c input array (RHS)
413 * \param n size of arrays
414 */
415 __global__ void _mulVV(cudaComplex* a, cudaComplex const * b,
416 cudaComplex const * c, const int n)
417 {
418 int nThreads = blockDim.x * gridDim.x;
419 int startID = blockIdx.x * blockDim.x + threadIdx.x;
420 for(int i = startID; i < n; i += nThreads) {
421 a[i].x = (b[i].x * c[i].x) - (b[i].y * c[i].y);
422 a[i].y = (b[i].x * c[i].y) + (b[i].y * c[i].x);
423 }
424 }
425
426 /*
427 * Vector multiplication, a[i] = b[i] * c[i], GPU kernel (mixed, b = real).
428 *
429 * \param a output array (LHS)
430 * \param b input array (RHS)
431 * \param c input array (RHS)
432 * \param n size of arrays
433 */
434 __global__ void _mulVV(cudaComplex* a, cudaReal const * b,
435 cudaComplex const * c, const int n)
436 {
437 int nThreads = blockDim.x * gridDim.x;
438 int startID = blockIdx.x * blockDim.x + threadIdx.x;
439 for(int i = startID; i < n; i += nThreads) {
440 a[i].x = b[i] * c[i].x;
441 a[i].y = b[i] * c[i].y;
442 }
443 }
444
445 /*
446 * Vector multiplication, a[i] = b[i] * c[i], GPU kernel (mixed, c = real).
447 *
448 * \param a output array (LHS)
449 * \param b input array (RHS)
450 * \param c input array (RHS)
451 * \param n size of arrays
452 */
453 __global__ void _mulVV(cudaComplex* a, cudaComplex const * b,
454 cudaReal const * c, const int n)
455 {
456 int nThreads = blockDim.x * gridDim.x;
457 int startID = blockIdx.x * blockDim.x + threadIdx.x;
458 for(int i = startID; i < n; i += nThreads) {
459 a[i].x = b[i].x * c[i];
460 a[i].y = b[i].y * c[i];
461 }
462 }
463
464 /*
465 * Vector multiplication, a[i] = b[i] * c, GPU kernel (cudaReal).
466 *
467 * \param a output array (LHS)
468 * \param b input array (RHS)
469 * \param c input scalar (RHS)
470 * \param n size of arrays
471 */
472 __global__ void _mulVS(cudaReal* a, cudaReal const * b,
473 cudaReal const c, const int n)
474 {
475 int nThreads = blockDim.x * gridDim.x;
476 int startID = blockIdx.x * blockDim.x + threadIdx.x;
477 for(int i = startID; i < n; i += nThreads) {
478 a[i] = b[i] * c;
479 }
480 }
481
482 /*
483 * Vector multiplication, a[i] = b[i] * c, GPU kernel (cudaComplex).
484 *
485 * \param a output array (LHS)
486 * \param b input array (RHS)
487 * \param c input scalar (RHS)
488 * \param n size of arrays
489 */
490 __global__ void _mulVS(cudaComplex* a, cudaComplex const * b,
491 cudaComplex const c, const int n)
492 {
493 int nThreads = blockDim.x * gridDim.x;
494 int startID = blockIdx.x * blockDim.x + threadIdx.x;
495 for(int i = startID; i < n; i += nThreads) {
496 a[i].x = (b[i].x * c.x) - (b[i].y * c.y);
497 a[i].y = (b[i].x * c.y) + (b[i].y * c.x);
498 }
499 }
500
501 /*
502 * Vector multiplication, a[i] = b[i] * c, GPU kernel (mixed, b = real).
503 *
504 * \param a output array (LHS)
505 * \param b input array (RHS)
506 * \param c input scalar (RHS)
507 * \param n size of arrays
508 */
509 __global__ void _mulVS(cudaComplex* a, cudaReal const * b,
510 cudaComplex const c, const int n)
511 {
512 int nThreads = blockDim.x * gridDim.x;
513 int startID = blockIdx.x * blockDim.x + threadIdx.x;
514 for(int i = startID; i < n; i += nThreads) {
515 a[i].x = b[i] * c.x;
516 a[i].y = b[i] * c.y;
517 }
518 }
519
520 /*
521 * Vector multiplication, a[i] = b[i] * c, GPU kernel (mixed, c = real).
522 *
523 * \param a output array (LHS)
524 * \param b input array (RHS)
525 * \param c input scalar (RHS)
526 * \param n size of arrays
527 */
528 __global__ void _mulVS(cudaComplex* a, cudaComplex const * b,
529 cudaReal const c, const int n)
530 {
531 int nThreads = blockDim.x * gridDim.x;
532 int startID = blockIdx.x * blockDim.x + threadIdx.x;
533 for(int i = startID; i < n; i += nThreads) {
534 a[i].x = b[i].x * c;
535 a[i].y = b[i].y * c;
536 }
537 }
538
539 /*
540 * Vector division, a[i] = b[i] / c[i], GPU kernel (cudaReal).
541 *
542 * \param a output array (LHS)
543 * \param b input array (RHS)
544 * \param c input array (RHS)
545 * \param n size of arrays
546 */
547 __global__ void _divVV(cudaReal* a, cudaReal const * b,
548 cudaReal const * c, const int n)
549 {
550 int nThreads = blockDim.x * gridDim.x;
551 int startID = blockIdx.x * blockDim.x + threadIdx.x;
552 for(int i = startID; i < n; i += nThreads) {
553 a[i] = b[i] / c[i];
554 }
555 }
556
557 /*
558 * Vector division, a[i] = b[i] / c[i], GPU kernel (mixed, c = real).
559 *
560 * \param a output array (LHS)
561 * \param b input array (RHS)
562 * \param c input array (RHS)
563 * \param n size of arrays
564 */
565 __global__ void _divVV(cudaComplex* a, cudaComplex const * b,
566 cudaReal const * c, const int n)
567 {
568 int nThreads = blockDim.x * gridDim.x;
569 int startID = blockIdx.x * blockDim.x + threadIdx.x;
570 for(int i = startID; i < n; i += nThreads) {
571 a[i].x = b[i].x / c[i];
572 a[i].y = b[i].y / c[i];
573 }
574 }
575
576 /*
577 * Vector division, a[i] = b[i] / c, GPU kernel (cudaReal).
578 *
579 * \param a output array (LHS)
580 * \param b input array (RHS)
581 * \param c input scalar (RHS)
582 * \param n size of arrays
583 */
584 __global__ void _divVS(cudaReal* a, cudaReal const * b,
585 cudaReal const c, const int n)
586 {
587 int nThreads = blockDim.x * gridDim.x;
588 int startID = blockIdx.x * blockDim.x + threadIdx.x;
589 for(int i = startID; i < n; i += nThreads) {
590 a[i] = b[i] / c;
591 }
592 }
593
594 /*
595 * Vector division, a[i] = b[i] / c, GPU kernel (mixed, c = real).
596 *
597 * \param a output array (LHS)
598 * \param b input array (RHS)
599 * \param c input scalar (RHS)
600 * \param n size of arrays
601 */
602 __global__ void _divVS(cudaComplex* a, cudaComplex const * b,
603 cudaReal const c, const int n)
604 {
605 int nThreads = blockDim.x * gridDim.x;
606 int startID = blockIdx.x * blockDim.x + threadIdx.x;
607 for(int i = startID; i < n; i += nThreads) {
608 a[i].x = b[i].x / c;
609 a[i].y = b[i].y / c;
610 }
611 }
612
613 /*
614 * Vector exponentiation, a[i] = exp(b[i]), GPU kernel (cudaReal).
615 *
616 * \param a output array (LHS)
617 * \param b input array (RHS)
618 * \param n size of arrays
619 */
620 __global__ void _expV(cudaReal* a, cudaReal const * b, const int n)
621 {
622 int nThreads = blockDim.x * gridDim.x;
623 int startID = blockIdx.x * blockDim.x + threadIdx.x;
624 for(int i = startID; i < n; i += nThreads) {
625 a[i] = exp(b[i]);
626 }
627 }
628
629 /*
630 * Vector exponentiation, a[i] = exp(b[i]), GPU kernel (cudaComplex).
631 *
632 * \param a output array (LHS)
633 * \param b input array (RHS)
634 * \param n size of arrays
635 */
636 __global__ void _expV(cudaComplex* a, cudaComplex const * b, const int n)
637 {
638 int nThreads = blockDim.x * gridDim.x;
639 int startID = blockIdx.x * blockDim.x + threadIdx.x;
640 for(int i = startID; i < n; i += nThreads) {
641 a[i].x = exp(b[i].x) * cos(b[i].y);
642 a[i].y = exp(b[i].x) * sin(b[i].y);
643 }
644 }
645
646 /*
647 * Vector addition in-place, a[i] += b[i], GPU kernel (cudaReal).
648 *
649 * \param a output array (LHS)
650 * \param b input array (RHS)
651 * \param n size of arrays
652 */
653 __global__ void _addEqV(cudaReal* a, cudaReal const * b, const int n)
654 {
655 int nThreads = blockDim.x * gridDim.x;
656 int startID = blockIdx.x * blockDim.x + threadIdx.x;
657 for(int i = startID; i < n; i += nThreads) {
658 a[i] += b[i];
659 }
660 }
661
662 /*
663 * Vector addition in-place, a[i] += b[i], GPU kernel (cudaComplex).
664 *
665 * \param a output array (LHS)
666 * \param b input array (RHS)
667 * \param n size of arrays
668 */
669 __global__ void _addEqV(cudaComplex* a, cudaComplex const * b, const int n)
670 {
671 int nThreads = blockDim.x * gridDim.x;
672 int startID = blockIdx.x * blockDim.x + threadIdx.x;
673 for(int i = startID; i < n; i += nThreads) {
674 a[i].x += b[i].x;
675 a[i].y += b[i].y;
676 }
677 }
678
679 /*
680 * Vector addition in-place, a[i] += b[i], GPU kernel (mixed, b = real).
681 *
682 * \param a output array (LHS)
683 * \param b input array (RHS)
684 * \param n size of arrays
685 */
686 __global__ void _addEqV(cudaComplex* a, cudaReal const * b, const int n)
687 {
688 int nThreads = blockDim.x * gridDim.x;
689 int startID = blockIdx.x * blockDim.x + threadIdx.x;
690 for(int i = startID; i < n; i += nThreads) {
691 a[i].x += b[i];
692 }
693 }
694
695 /*
696 * Vector addition in-place, a[i] += b, GPU kernel (cudaReal).
697 *
698 * \param a output array (LHS)
699 * \param b input scalar (RHS)
700 * \param n size of arrays
701 */
702 __global__ void _addEqS(cudaReal* a, cudaReal const b, const int n)
703 {
704 int nThreads = blockDim.x * gridDim.x;
705 int startID = blockIdx.x * blockDim.x + threadIdx.x;
706 for(int i = startID; i < n; i += nThreads) {
707 a[i] += b;
708 }
709 }
710
711 /*
712 * Vector addition in-place, a[i] += b, GPU kernel (cudaComplex).
713 *
714 * \param a output array (LHS)
715 * \param b input scalar (RHS)
716 * \param n size of arrays
717 */
718 __global__ void _addEqS(cudaComplex* a, cudaComplex const b, const int n)
719 {
720 int nThreads = blockDim.x * gridDim.x;
721 int startID = blockIdx.x * blockDim.x + threadIdx.x;
722 for(int i = startID; i < n; i += nThreads) {
723 a[i].x += b.x;
724 a[i].y += b.y;
725 }
726 }
727
728 /*
729 * Vector addition in-place, a[i] += b, GPU kernel (mixed, b = real).
730 *
731 * \param a output array (LHS)
732 * \param b input scalar (RHS)
733 * \param n size of arrays
734 */
735 __global__ void _addEqS(cudaComplex* a, cudaReal const b, const int n)
736 {
737 int nThreads = blockDim.x * gridDim.x;
738 int startID = blockIdx.x * blockDim.x + threadIdx.x;
739 for(int i = startID; i < n; i += nThreads) {
740 a[i].x += b;
741 }
742 }
743
744 /*
745 * Vector subtraction in-place, a[i] -= b[i], GPU kernel (cudaReal).
746 *
747 * \param a output array (LHS)
748 * \param b input array (RHS)
749 * \param n size of arrays
750 */
751 __global__ void _subEqV(cudaReal* a, cudaReal const * b, const int n)
752 {
753 int nThreads = blockDim.x * gridDim.x;
754 int startID = blockIdx.x * blockDim.x + threadIdx.x;
755 for(int i = startID; i < n; i += nThreads) {
756 a[i] -= b[i];
757 }
758 }
759
760 /*
761 * Vector subtraction in-place, a[i] -= b[i], GPU kernel (cudaComplex).
762 *
763 * \param a output array (LHS)
764 * \param b input array (RHS)
765 * \param n size of arrays
766 */
767 __global__ void _subEqV(cudaComplex* a, cudaComplex const * b, const int n)
768 {
769 int nThreads = blockDim.x * gridDim.x;
770 int startID = blockIdx.x * blockDim.x + threadIdx.x;
771 for(int i = startID; i < n; i += nThreads) {
772 a[i].x -= b[i].x;
773 a[i].y -= b[i].y;
774 }
775 }
776
777 /*
778 * Vector subtraction in-place, a[i] -= b[i], GPU kernel (mixed, b = real).
779 *
780 * \param a output array (LHS)
781 * \param b input array (RHS)
782 * \param n size of arrays
783 */
784 __global__ void _subEqV(cudaComplex* a, cudaReal const * b, const int n)
785 {
786 int nThreads = blockDim.x * gridDim.x;
787 int startID = blockIdx.x * blockDim.x + threadIdx.x;
788 for(int i = startID; i < n; i += nThreads) {
789 a[i].x -= b[i];
790 }
791 }
792
793 /*
794 * Vector subtraction in-place, a[i] -= b, GPU kernel (cudaReal).
795 *
796 * \param a output array (LHS)
797 * \param b input scalar (RHS)
798 * \param n size of arrays
799 */
800 __global__ void _subEqS(cudaReal* a, cudaReal const b, const int n)
801 {
802 int nThreads = blockDim.x * gridDim.x;
803 int startID = blockIdx.x * blockDim.x + threadIdx.x;
804 for(int i = startID; i < n; i += nThreads) {
805 a[i] -= b;
806 }
807 }
808
809 /*
810 * Vector subtraction in-place, a[i] -= b, GPU kernel (cudaComplex).
811 *
812 * \param a output array (LHS)
813 * \param b input scalar (RHS)
814 * \param n size of arrays
815 */
816 __global__ void _subEqS(cudaComplex* a, cudaComplex const b, const int n)
817 {
818 int nThreads = blockDim.x * gridDim.x;
819 int startID = blockIdx.x * blockDim.x + threadIdx.x;
820 for(int i = startID; i < n; i += nThreads) {
821 a[i].x -= b.x;
822 a[i].y -= b.y;
823 }
824 }
825
826 /*
827 * Vector subtraction in-place, a[i] -= b, GPU kernel (mixed, b = real).
828 *
829 * \param a output array (LHS)
830 * \param b input scalar (RHS)
831 * \param n size of arrays
832 */
833 __global__ void _subEqS(cudaComplex* a, cudaReal const b, const int n)
834 {
835 int nThreads = blockDim.x * gridDim.x;
836 int startID = blockIdx.x * blockDim.x + threadIdx.x;
837 for(int i = startID; i < n; i += nThreads) {
838 a[i].x -= b;
839 }
840 }
841
842 /*
843 * Vector multiplication in-place, a[i] *= b[i], GPU kernel (cudaReal).
844 *
845 * \param a output array (LHS)
846 * \param b input array (RHS)
847 * \param n size of arrays
848 */
849 __global__ void _mulEqV(cudaReal* a, cudaReal const * b, const int n)
850 {
851 int nThreads = blockDim.x * gridDim.x;
852 int startID = blockIdx.x * blockDim.x + threadIdx.x;
853 for(int i = startID; i < n; i += nThreads) {
854 a[i] *= b[i];
855 }
856 }
857
858 /*
859 * Vector multiplication in-place, a[i] *= b[i], GPU kernel (cudaComplex).
860 *
861 * \param a output array (LHS)
862 * \param b input array (RHS)
863 * \param n size of arrays
864 */
865 __global__ void _mulEqV(cudaComplex* a, cudaComplex const * b, const int n)
866 {
867 int nThreads = blockDim.x * gridDim.x;
868 int startID = blockIdx.x * blockDim.x + threadIdx.x;
869 cudaComplex c;
870 for(int i = startID; i < n; i += nThreads) {
871 c.x = (a[i].x * b[i].x) - (a[i].y * b[i].y);
872 c.y = (a[i].x * b[i].y) + (a[i].y * b[i].x);
873 a[i].x = c.x;
874 a[i].y = c.y;
875 }
876 }
877
878 /*
879 * Vector multiplication in-place, a[i]*=b[i], GPU kernel (mixed, b = real).
880 *
881 * \param a output array (LHS)
882 * \param b input array (RHS)
883 * \param n size of arrays
884 */
885 __global__ void _mulEqV(cudaComplex* a, cudaReal const * b, const int n)
886 {
887 int nThreads = blockDim.x * gridDim.x;
888 int startID = blockIdx.x * blockDim.x + threadIdx.x;
889 for(int i = startID; i < n; i += nThreads) {
890 a[i].x *= b[i];
891 a[i].y *= b[i];
892 }
893 }
894
895 /*
896 * Vector multiplication in-place, a[i] *= b, GPU kernel (cudaReal).
897 *
898 * \param a output array (LHS)
899 * \param b input scalar (RHS)
900 * \param n size of arrays
901 */
902 __global__ void _mulEqS(cudaReal* a, cudaReal const b, const int n)
903 {
904 int nThreads = blockDim.x * gridDim.x;
905 int startID = blockIdx.x * blockDim.x + threadIdx.x;
906 for(int i = startID; i < n; i += nThreads) {
907 a[i] *= b;
908 }
909 }
910
911 /*
912 * Vector multiplication in-place, a[i] *= b, GPU kernel (cudaComplex).
913 *
914 * \param a output array (LHS)
915 * \param b input scalar (RHS)
916 * \param n size of arrays
917 */
918 __global__ void _mulEqS(cudaComplex* a, cudaComplex const b, const int n)
919 {
920 int nThreads = blockDim.x * gridDim.x;
921 int startID = blockIdx.x * blockDim.x + threadIdx.x;
922 cudaComplex c;
923 for(int i = startID; i < n; i += nThreads) {
924 c.x = (a[i].x * b.x) - (a[i].y * b.y);
925 c.y = (a[i].x * b.y) + (a[i].y * b.x);
926 a[i].x = c.x;
927 a[i].y = c.y;
928 }
929 }
930
931 /*
932 * Vector multiplication in-place, a[i] *= b, GPU kernel (mixed, b = real).
933 *
934 * \param a output array (LHS)
935 * \param b input scalar (RHS)
936 * \param n size of arrays
937 */
938 __global__ void _mulEqS(cudaComplex* a, cudaReal const b, const int n)
939 {
940 int nThreads = blockDim.x * gridDim.x;
941 int startID = blockIdx.x * blockDim.x + threadIdx.x;
942 for(int i = startID; i < n; i += nThreads) {
943 a[i].x *= b;
944 a[i].y *= b;
945 }
946 }
947
948 /*
949 * Vector division in-place, a[i] /= b[i], GPU kernel (cudaReal).
950 *
951 * \param a output array (LHS)
952 * \param b input array (RHS)
953 * \param n size of arrays
954 */
955 __global__ void _divEqV(cudaReal* a, cudaReal const * b, const int n)
956 {
957 int nThreads = blockDim.x * gridDim.x;
958 int startID = blockIdx.x * blockDim.x + threadIdx.x;
959 for(int i = startID; i < n; i += nThreads) {
960 a[i] /= b[i];
961 }
962 }
963
964 /*
965 * Vector division in-place, a[i] /= b[i], GPU kernel (mixed, b = real).
966 *
967 * \param a output array (LHS)
968 * \param b input array (RHS)
969 * \param n size of arrays
970 */
971 __global__ void _divEqV(cudaComplex* a, cudaReal const * b, const int n)
972 {
973 int nThreads = blockDim.x * gridDim.x;
974 int startID = blockIdx.x * blockDim.x + threadIdx.x;
975 for(int i = startID; i < n; i += nThreads) {
976 a[i].x /= b[i];
977 a[i].y /= b[i];
978 }
979 }
980
981 /*
982 * Vector division in-place, a[i] /= b, GPU kernel (cudaReal).
983 *
984 * \param a output array (LHS)
985 * \param b input scalar (RHS)
986 * \param n size of arrays
987 */
988 __global__ void _divEqS(cudaReal* a, cudaReal const b, const int n)
989 {
990 int nThreads = blockDim.x * gridDim.x;
991 int startID = blockIdx.x * blockDim.x + threadIdx.x;
992 for(int i = startID; i < n; i += nThreads) {
993 a[i] /= b;
994 }
995 }
996
997 /*
998 * Vector division in-place, a[i] /= b, GPU kernel (mixed, b = real).
999 *
1000 * \param a output array (LHS)
1001 * \param b input scalar (RHS)
1002 * \param n size of arrays
1003 */
1004 __global__ void _divEqS(cudaComplex* a, cudaReal const b, const int n)
1005 {
1006 int nThreads = blockDim.x * gridDim.x;
1007 int startID = blockIdx.x * blockDim.x + threadIdx.x;
1008 for(int i = startID; i < n; i += nThreads) {
1009 a[i].x /= b;
1010 a[i].y /= b;
1011 }
1012 }
1013
1014} // end anonymous namespace
1015
1016
1017// CUDA kernel wrappers:
1018
1019// Vector assignment, a[i] = b[i], kernel wrapper (cudaReal).
1021 const int beginIdA, const int beginIdB, const int n)
1022{
1023 UTIL_CHECK(a.capacity() >= n + beginIdA);
1024 UTIL_CHECK(b.capacity() >= n + beginIdB);
1025
1026 // GPU resources
1027 int nBlocks, nThreads;
1028 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1029
1030 // Launch kernel
1031 _eqV<<<nBlocks, nThreads>>>(a.cArray()+beginIdA,
1032 b.cArray()+beginIdB, n);
1033 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1034}
1035
1036// Vector assignment, a[i] = b[i], kernel wrapper (cudaComplex).
1038 const int beginIdA, const int beginIdB, const int n)
1039{
1040 UTIL_CHECK(a.capacity() >= n + beginIdA);
1041 UTIL_CHECK(b.capacity() >= n + beginIdB);
1042
1043 // GPU resources
1044 int nBlocks, nThreads;
1045 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1046
1047 // Launch kernel
1048 _eqV<<<nBlocks, nThreads>>>(a.cArray()+beginIdA,
1049 b.cArray()+beginIdB, n);
1050 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1051}
1052
1053// Vector assignment, a[i] = b, kernel wrapper (cudaReal).
1054void eqS(DeviceArray<cudaReal>& a, cudaReal const b,
1055 const int beginIdA, const int n)
1056{
1057 UTIL_CHECK(a.capacity() >= n + beginIdA);
1058
1059 // GPU resources
1060 int nBlocks, nThreads;
1061 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1062
1063 // Launch kernel
1064 _eqS<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b, n);
1065 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1066}
1067
// Vector assignment, a[i] = b, kernel wrapper (cudaComplex).
void eqS(DeviceArray<cudaComplex>& a, cudaComplex const b,
         const int beginIdA, const int n)
{
   // Destination must hold n elements starting at offset beginIdA
   UTIL_CHECK(a.capacity() >= n + beginIdA);

   // Choose launch configuration for n logical threads
   int nBlocks, nThreads;
   ThreadArray::setThreadsLogical(n, nBlocks, nThreads);

   // Launch kernel, then check for launch-configuration errors
   _eqS<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b, n);
   cudaErrorCheck( cudaGetLastError() );
}
1082
1083// Vector addition, a[i] = b[i] + c[i], kernel wrapper (cudaReal).
1085 DeviceArray<cudaReal> const & c, const int beginIdA,
1086 const int beginIdB, const int beginIdC, const int n)
1087{
1088 UTIL_CHECK(a.capacity() >= n + beginIdA);
1089 UTIL_CHECK(b.capacity() >= n + beginIdB);
1090 UTIL_CHECK(c.capacity() >= n + beginIdC);
1091
1092 // GPU resources
1093 int nBlocks, nThreads;
1094 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1095
1096 // Launch kernel
1097 _addVV<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b.cArray()+beginIdB,
1098 c.cArray()+beginIdC, n);
1099 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1100}
1101
1102// Vector addition, a[i] = b[i] + c[i], kernel wrapper (cudaComplex).
1104 DeviceArray<cudaComplex> const & c, const int beginIdA,
1105 const int beginIdB, const int beginIdC, const int n)
1106{
1107 UTIL_CHECK(a.capacity() >= n + beginIdA);
1108 UTIL_CHECK(b.capacity() >= n + beginIdB);
1109 UTIL_CHECK(c.capacity() >= n + beginIdC);
1110
1111 // GPU resources
1112 int nBlocks, nThreads;
1113 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1114
1115 // Launch kernel
1116 _addVV<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b.cArray()+beginIdB,
1117 c.cArray()+beginIdC, n);
1118 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1119}
1120
1121// Vector addition, a[i] = b[i] + c[i], kernel wrapper (mixed, b = real).
1123 DeviceArray<cudaComplex> const & c, const int beginIdA,
1124 const int beginIdB, const int beginIdC, const int n)
1125{
1126 UTIL_CHECK(a.capacity() >= n + beginIdA);
1127 UTIL_CHECK(b.capacity() >= n + beginIdB);
1128 UTIL_CHECK(c.capacity() >= n + beginIdC);
1129
1130 // GPU resources
1131 int nBlocks, nThreads;
1132 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1133
1134 // Launch kernel
1135 _addVV<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b.cArray()+beginIdB,
1136 c.cArray()+beginIdC, n);
1137 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1138}
1139
1140// Vector addition, a[i] = b[i] + c[i], kernel wrapper (mixed, c = real).
1142 DeviceArray<cudaReal> const & c, const int beginIdA,
1143 const int beginIdB, const int beginIdC, const int n)
1144{
1145 UTIL_CHECK(a.capacity() >= n + beginIdA);
1146 UTIL_CHECK(b.capacity() >= n + beginIdB);
1147 UTIL_CHECK(c.capacity() >= n + beginIdC);
1148
1149 // GPU resources
1150 int nBlocks, nThreads;
1151 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1152
1153 // Launch kernel
1154 _addVV<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b.cArray()+beginIdB,
1155 c.cArray()+beginIdC, n);
1156 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1157}
1158
// Vector addition, a[i] = b[i] + c, kernel wrapper (cudaReal).
void addVS(DeviceArray<cudaReal>& a, DeviceArray<cudaReal> const & b,
           cudaReal const c, const int beginIdA, const int beginIdB,
           int n)
{
   // Preconditions: each array must hold n elements past its offset
   UTIL_CHECK(a.capacity() >= n + beginIdA);
   UTIL_CHECK(b.capacity() >= n + beginIdB);

   // GPU resources
   int nBlocks, nThreads;
   ThreadArray::setThreadsLogical(n, nBlocks, nThreads);

   // Launch kernel
   _addVS<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b.cArray()+beginIdB,
                                 c, n);
   cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
}
1175
1176// Vector addition, a[i] = b[i] + c, kernel wrapper (cudaComplex).
1178 cudaComplex const c, const int beginIdA, const int beginIdB,int n)
1179{
1180 UTIL_CHECK(a.capacity() >= n + beginIdA);
1181 UTIL_CHECK(b.capacity() >= n + beginIdB);
1182
1183 // GPU resources
1184 int nBlocks, nThreads;
1185 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1186
1187 // Launch kernel
1188 _addVS<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b.cArray()+beginIdB,
1189 c, n);
1190 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1191}
1192
1193// Vector addition, a[i] = b[i] + c, kernel wrapper (mixed, b = real).
1195 cudaComplex const c, const int beginIdA, const int beginIdB,int n)
1196{
1197 UTIL_CHECK(a.capacity() >= n + beginIdA);
1198 UTIL_CHECK(b.capacity() >= n + beginIdB);
1199
1200 // GPU resources
1201 int nBlocks, nThreads;
1202 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1203
1204 // Launch kernel
1205 _addVS<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b.cArray()+beginIdB,
1206 c, n);
1207 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1208}
1209
1210// Vector addition, a[i] = b[i] + c, kernel wrapper (mixed, c = real).
1212 cudaReal const c, const int beginIdA, const int beginIdB,int n)
1213{
1214 UTIL_CHECK(a.capacity() >= n + beginIdA);
1215 UTIL_CHECK(b.capacity() >= n + beginIdB);
1216
1217 // GPU resources
1218 int nBlocks, nThreads;
1219 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1220
1221 // Launch kernel
1222 _addVS<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b.cArray()+beginIdB,
1223 c, n);
1224 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1225}
1226
// Vector subtraction, a[i] = b[i] - c[i], kernel wrapper (cudaReal).
void subVV(DeviceArray<cudaReal>& a, DeviceArray<cudaReal> const & b,
           DeviceArray<cudaReal> const & c, const int beginIdA,
           const int beginIdB, const int beginIdC, const int n)
{
   // Preconditions: each array must hold n elements past its offset
   UTIL_CHECK(a.capacity() >= n + beginIdA);
   UTIL_CHECK(b.capacity() >= n + beginIdB);
   UTIL_CHECK(c.capacity() >= n + beginIdC);

   // GPU resources
   int nBlocks, nThreads;
   ThreadArray::setThreadsLogical(n, nBlocks, nThreads);

   // Launch kernel
   _subVV<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b.cArray()+beginIdB,
                                 c.cArray()+beginIdC, n);
   cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
}
1245
1246// Vector subtraction, a[i] = b[i] - c[i], kernel wrapper (cudaComplex).
1248 DeviceArray<cudaComplex> const & c, const int beginIdA,
1249 const int beginIdB, const int beginIdC, const int n)
1250{
1251 UTIL_CHECK(a.capacity() >= n + beginIdA);
1252 UTIL_CHECK(b.capacity() >= n + beginIdB);
1253 UTIL_CHECK(c.capacity() >= n + beginIdC);
1254
1255 // GPU resources
1256 int nBlocks, nThreads;
1257 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1258
1259 // Launch kernel
1260 _subVV<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b.cArray()+beginIdB,
1261 c.cArray()+beginIdC, n);
1262 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1263}
1264
1265// Vector subtraction, a[i]=b[i]-c[i], kernel wrapper (mixed, b=real).
1267 DeviceArray<cudaComplex> const & c, const int beginIdA,
1268 const int beginIdB, const int beginIdC, const int n)
1269{
1270 UTIL_CHECK(a.capacity() >= n + beginIdA);
1271 UTIL_CHECK(b.capacity() >= n + beginIdB);
1272 UTIL_CHECK(c.capacity() >= n + beginIdC);
1273
1274 // GPU resources
1275 int nBlocks, nThreads;
1276 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1277
1278 // Launch kernel
1279 _subVV<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b.cArray()+beginIdB,
1280 c.cArray()+beginIdC, n);
1281 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1282}
1283
1284// Vector subtraction, a[i]=b[i]-c[i], kernel wrapper (mixed, c=real).
1286 DeviceArray<cudaReal> const & c, const int beginIdA,
1287 const int beginIdB, const int beginIdC, const int n)
1288{
1289 UTIL_CHECK(a.capacity() >= n + beginIdA);
1290 UTIL_CHECK(b.capacity() >= n + beginIdB);
1291 UTIL_CHECK(c.capacity() >= n + beginIdC);
1292
1293 // GPU resources
1294 int nBlocks, nThreads;
1295 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1296
1297 // Launch kernel
1298 _subVV<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b.cArray()+beginIdB,
1299 c.cArray()+beginIdC, n);
1300 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1301}
1302
// Vector subtraction, a[i] = b[i] - c, kernel wrapper (cudaReal).
void subVS(DeviceArray<cudaReal>& a,
           DeviceArray<cudaReal> const & b, cudaReal const c,
           const int beginIdA, const int beginIdB, const int n)
{
   // Preconditions: each array must hold n elements past its offset
   UTIL_CHECK(a.capacity() >= n + beginIdA);
   UTIL_CHECK(b.capacity() >= n + beginIdB);

   // GPU resources
   int nBlocks, nThreads;
   ThreadArray::setThreadsLogical(n, nBlocks, nThreads);

   // Launch kernel
   _subVS<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b.cArray()+beginIdB,
                                 c, n);
   cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
}
1320
1321// Vector subtraction, a[i] = b[i] - c, kernel wrapper (cudaComplex).
1323 cudaComplex const c, const int beginIdA, const int beginIdB,
1324 const int n)
1325{
1326 UTIL_CHECK(a.capacity() >= n + beginIdA);
1327 UTIL_CHECK(b.capacity() >= n + beginIdB);
1328
1329 // GPU resources
1330 int nBlocks, nThreads;
1331 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1332
1333 // Launch kernel
1334 _subVS<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b.cArray()+beginIdB,
1335 c, n);
1336 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1337}
1338
1339// Vector subtraction, a[i] = b[i] - c, kernel wrapper (mixed, b = real).
1341 cudaComplex const c, const int beginIdA, const int beginIdB,
1342 const int n)
1343{
1344 UTIL_CHECK(a.capacity() >= n + beginIdA);
1345 UTIL_CHECK(b.capacity() >= n + beginIdB);
1346
1347 // GPU resources
1348 int nBlocks, nThreads;
1349 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1350
1351 // Launch kernel
1352 _subVS<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b.cArray()+beginIdB,
1353 c, n);
1354 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1355}
1356
1357// Vector subtraction, a[i] = b[i] - c, kernel wrapper (mixed, c = real).
1359 cudaReal const c, const int beginIdA, const int beginIdB,
1360 const int n)
1361{
1362 UTIL_CHECK(a.capacity() >= n + beginIdA);
1363 UTIL_CHECK(b.capacity() >= n + beginIdB);
1364
1365 // GPU resources
1366 int nBlocks, nThreads;
1367 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1368
1369 // Launch kernel
1370 _subVS<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b.cArray()+beginIdB,
1371 c, n);
1372 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1373}
1374
// Vector multiplication, a[i] = b[i] * c[i], kernel wrapper (cudaReal).
void mulVV(DeviceArray<cudaReal>& a, DeviceArray<cudaReal> const & b,
           DeviceArray<cudaReal> const & c, const int beginIdA,
           const int beginIdB, const int beginIdC, const int n)
{
   // Preconditions: each array must hold n elements past its offset
   UTIL_CHECK(a.capacity() >= n + beginIdA);
   UTIL_CHECK(b.capacity() >= n + beginIdB);
   UTIL_CHECK(c.capacity() >= n + beginIdC);

   // GPU resources
   int nBlocks, nThreads;
   ThreadArray::setThreadsLogical(n, nBlocks, nThreads);

   // Launch kernel
   _mulVV<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b.cArray()+beginIdB,
                                 c.cArray()+beginIdC, n);
   cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
}
1393
1394// Vector multiplication, a[i] = b[i] * c[i], kernel wrapper (cudaComplex).
1396 DeviceArray<cudaComplex> const & c, const int beginIdA,
1397 const int beginIdB, const int beginIdC, const int n)
1398{
1399 UTIL_CHECK(a.capacity() >= n + beginIdA);
1400 UTIL_CHECK(b.capacity() >= n + beginIdB);
1401 UTIL_CHECK(c.capacity() >= n + beginIdC);
1402
1403 // GPU resources
1404 int nBlocks, nThreads;
1405 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1406
1407 // Launch kernel
1408 _mulVV<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b.cArray()+beginIdB,
1409 c.cArray()+beginIdC, n);
1410 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1411}
1412
1413// Vector multiplication, a[i]=b[i]*c[i], kernel wrapper (mixed, b = real).
1415 DeviceArray<cudaComplex> const & c, const int beginIdA,
1416 const int beginIdB, const int beginIdC, const int n)
1417{
1418 UTIL_CHECK(a.capacity() >= n + beginIdA);
1419 UTIL_CHECK(b.capacity() >= n + beginIdB);
1420 UTIL_CHECK(c.capacity() >= n + beginIdC);
1421
1422 // GPU resources
1423 int nBlocks, nThreads;
1424 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1425
1426 // Launch kernel
1427 _mulVV<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b.cArray()+beginIdB,
1428 c.cArray()+beginIdC, n);
1429 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1430}
1431
1432// Vector multiplication, a[i]=b[i]*c[i], kernel wrapper (mixed, c = real).
1434 DeviceArray<cudaReal> const & c, const int beginIdA,
1435 const int beginIdB, const int beginIdC, const int n)
1436{
1437 UTIL_CHECK(a.capacity() >= n + beginIdA);
1438 UTIL_CHECK(b.capacity() >= n + beginIdB);
1439 UTIL_CHECK(c.capacity() >= n + beginIdC);
1440
1441 // GPU resources
1442 int nBlocks, nThreads;
1443 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1444
1445 // Launch kernel
1446 _mulVV<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b.cArray()+beginIdB,
1447 c.cArray()+beginIdC, n);
1448 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1449}
1450
// Vector multiplication, a[i] = b[i] * c, kernel wrapper (cudaReal).
void mulVS(DeviceArray<cudaReal>& a, DeviceArray<cudaReal> const & b,
           cudaReal const c, const int beginIdA, const int beginIdB,
           const int n)
{
   // Preconditions: each array must hold n elements past its offset
   UTIL_CHECK(a.capacity() >= n + beginIdA);
   UTIL_CHECK(b.capacity() >= n + beginIdB);

   // GPU resources
   int nBlocks, nThreads;
   ThreadArray::setThreadsLogical(n, nBlocks, nThreads);

   // Launch kernel
   _mulVS<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b.cArray()+beginIdB,
                                 c, n);
   cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
}
1468
1469// Vector multiplication, a[i] = b[i] * c, kernel wrapper (cudaComplex).
1471 cudaComplex const c, const int beginIdA, const int beginIdB,
1472 const int n)
1473{
1474 UTIL_CHECK(a.capacity() >= n + beginIdA);
1475 UTIL_CHECK(b.capacity() >= n + beginIdB);
1476
1477 // GPU resources
1478 int nBlocks, nThreads;
1479 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1480
1481 // Launch kernel
1482 _mulVS<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b.cArray()+beginIdB,
1483 c, n);
1484 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1485}
1486
1487// Vector multiplication, a[i] = b[i] * c, kernel wrapper (mixed, b = real).
1489 cudaComplex const c, const int beginIdA, const int beginIdB,
1490 const int n)
1491{
1492 UTIL_CHECK(a.capacity() >= n + beginIdA);
1493 UTIL_CHECK(b.capacity() >= n + beginIdB);
1494
1495 // GPU resources
1496 int nBlocks, nThreads;
1497 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1498
1499 // Launch kernel
1500 _mulVS<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b.cArray()+beginIdB,
1501 c, n);
1502 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1503}
1504
1505// Vector multiplication, a[i] = b[i] * c, kernel wrapper (mixed, c = real).
1507 cudaReal const c, const int beginIdA, const int beginIdB,
1508 const int n)
1509{
1510 UTIL_CHECK(a.capacity() >= n + beginIdA);
1511 UTIL_CHECK(b.capacity() >= n + beginIdB);
1512
1513 // GPU resources
1514 int nBlocks, nThreads;
1515 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1516
1517 // Launch kernel
1518 _mulVS<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b.cArray()+beginIdB,
1519 c, n);
1520 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1521}
1522
// Vector division, a[i] = b[i] / c[i], kernel wrapper (cudaReal).
void divVV(DeviceArray<cudaReal>& a, DeviceArray<cudaReal> const & b,
           DeviceArray<cudaReal> const & c, const int beginIdA,
           const int beginIdB, const int beginIdC, const int n)
{
   // Preconditions: each array must hold n elements past its offset
   UTIL_CHECK(a.capacity() >= n + beginIdA);
   UTIL_CHECK(b.capacity() >= n + beginIdB);
   UTIL_CHECK(c.capacity() >= n + beginIdC);

   // GPU resources
   int nBlocks, nThreads;
   ThreadArray::setThreadsLogical(n, nBlocks, nThreads);

   // Launch kernel (no zero-divisor check is performed on device)
   _divVV<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b.cArray()+beginIdB,
                                 c.cArray()+beginIdC, n);
   cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
}
1541
1542// Vector division, a[i] = b[i] / c[i], kernel wrapper (mixed, c = real).
1544 DeviceArray<cudaReal> const & c, const int beginIdA,
1545 const int beginIdB, const int beginIdC, const int n)
1546{
1547 UTIL_CHECK(a.capacity() >= n + beginIdA);
1548 UTIL_CHECK(b.capacity() >= n + beginIdB);
1549 UTIL_CHECK(c.capacity() >= n + beginIdC);
1550
1551 // GPU resources
1552 int nBlocks, nThreads;
1553 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1554
1555 // Launch kernel
1556 _divVV<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b.cArray()+beginIdB,
1557 c.cArray()+beginIdC, n);
1558 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1559}
1560
// Vector division, a[i] = b[i] / c, kernel wrapper (cudaReal).
void divVS(DeviceArray<cudaReal>& a, DeviceArray<cudaReal> const & b,
           cudaReal const c, const int beginIdA, const int beginIdB,
           const int n)
{
   // Preconditions: each array must hold n elements past its offset
   UTIL_CHECK(a.capacity() >= n + beginIdA);
   UTIL_CHECK(b.capacity() >= n + beginIdB);

   // GPU resources
   int nBlocks, nThreads;
   ThreadArray::setThreadsLogical(n, nBlocks, nThreads);

   // Launch kernel (no zero-divisor check is performed on device)
   _divVS<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b.cArray()+beginIdB,
                                 c, n);
   cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
}
1578
1579// Vector division, a[i] = b[i] / c, kernel wrapper (mixed, c = real).
1581 cudaReal const c, const int beginIdA, const int beginIdB,
1582 const int n)
1583{
1584 UTIL_CHECK(a.capacity() >= n + beginIdA);
1585 UTIL_CHECK(b.capacity() >= n + beginIdB);
1586
1587 // GPU resources
1588 int nBlocks, nThreads;
1589 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1590
1591 // Launch kernel
1592 _divVS<<<nBlocks, nThreads>>>(a.cArray()+beginIdA, b.cArray()+beginIdB,
1593 c, n);
1594 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1595}
1596
// Vector exponentiation, a[i] = exp(b[i]), kernel wrapper (cudaReal).
void expV(DeviceArray<cudaReal>& a, DeviceArray<cudaReal> const & b,
          const int beginIdA, const int beginIdB, const int n)
{
   // Preconditions: each array must hold n elements past its offset
   UTIL_CHECK(a.capacity() >= n + beginIdA);
   UTIL_CHECK(b.capacity() >= n + beginIdB);

   // GPU resources
   int nBlocks, nThreads;
   ThreadArray::setThreadsLogical(n, nBlocks, nThreads);

   // Launch kernel
   _expV<<<nBlocks, nThreads>>>(a.cArray()+beginIdA,
                                b.cArray()+beginIdB, n);
   cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
}
1613
1614// Vector exponentiation, a[i] = exp(b[i]), kernel wrapper (cudaComplex).
1616 const int beginIdA, const int beginIdB, const int n)
1617{
1618 UTIL_CHECK(a.capacity() >= n + beginIdA);
1619 UTIL_CHECK(b.capacity() >= n + beginIdB);
1620
1621 // GPU resources
1622 int nBlocks, nThreads;
1623 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1624
1625 // Launch kernel
1626 _expV<<<nBlocks, nThreads>>>(a.cArray()+beginIdA,
1627 b.cArray()+beginIdB, n);
1628 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1629}
1630
// Vector addition in-place, a[i] += b[i], kernel wrapper (cudaReal).
void addEqV(DeviceArray<cudaReal>& a, DeviceArray<cudaReal> const & b,
            const int beginIdA, const int beginIdB, const int n)
{
   // Preconditions: each array must hold n elements past its offset
   UTIL_CHECK(a.capacity() >= n + beginIdA);
   UTIL_CHECK(b.capacity() >= n + beginIdB);

   // GPU resources
   int nBlocks, nThreads;
   ThreadArray::setThreadsLogical(n, nBlocks, nThreads);

   // Launch kernel
   _addEqV<<<nBlocks, nThreads>>>(a.cArray()+beginIdA,
                                  b.cArray()+beginIdB, n);
   cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
}
1647
1648// Vector addition in-place, a[i] += b[i], kernel wrapper (cudaComplex).
1650 const int beginIdA, const int beginIdB, const int n)
1651{
1652 UTIL_CHECK(a.capacity() >= n + beginIdA);
1653 UTIL_CHECK(b.capacity() >= n + beginIdB);
1654
1655 // GPU resources
1656 int nBlocks, nThreads;
1657 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1658
1659 // Launch kernel
1660 _addEqV<<<nBlocks, nThreads>>>(a.cArray()+beginIdA,
1661 b.cArray()+beginIdB, n);
1662 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1663}
1664
1665// Vector addition in-place, a[i] += b[i], kernel wrapper (mixed).
1667 const int beginIdA, const int beginIdB, const int n)
1668{
1669 UTIL_CHECK(a.capacity() >= n + beginIdA);
1670 UTIL_CHECK(b.capacity() >= n + beginIdB);
1671
1672 // GPU resources
1673 int nBlocks, nThreads;
1674 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1675
1676 // Launch kernel
1677 _addEqV<<<nBlocks, nThreads>>>(a.cArray()+beginIdA,
1678 b.cArray()+beginIdB, n);
1679 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1680}
1681
// Vector addition in-place, a[i] += b, kernel wrapper (cudaReal).
void addEqS(DeviceArray<cudaReal>& a, cudaReal const b,
            const int beginIdA, const int n)
{
   // Elements [beginIdA, beginIdA + n) must lie within array a
   UTIL_CHECK(a.capacity() >= n + beginIdA);

   // Choose a 1D execution configuration for n logical threads
   int numBlocks, numThreads;
   ThreadArray::setThreadsLogical(n, numBlocks, numThreads);

   // Add scalar b to each element, then check for launch errors
   _addEqS<<<numBlocks, numThreads>>>(a.cArray() + beginIdA, b, n);
   cudaErrorCheck( cudaGetLastError() );
}
1696
// Vector addition in-place, a[i] += b, kernel wrapper (cudaComplex).
void addEqS(DeviceArray<cudaComplex>& a, cudaComplex const b,
            const int beginIdA, const int n)
{
   // Elements [beginIdA, beginIdA + n) must lie within array a
   UTIL_CHECK(a.capacity() >= n + beginIdA);

   // Choose a 1D execution configuration for n logical threads
   int numBlocks, numThreads;
   ThreadArray::setThreadsLogical(n, numBlocks, numThreads);

   // Add scalar b to each element, then check for launch errors
   _addEqS<<<numBlocks, numThreads>>>(a.cArray() + beginIdA, b, n);
   cudaErrorCheck( cudaGetLastError() );
}
1711
// Vector addition in-place, a[i] += b, kernel wrapper (mixed).
void addEqS(DeviceArray<cudaComplex>& a, cudaReal const b,
            const int beginIdA, const int n)
{
   // Elements [beginIdA, beginIdA + n) must lie within array a
   UTIL_CHECK(a.capacity() >= n + beginIdA);

   // Choose a 1D execution configuration for n logical threads
   int numBlocks, numThreads;
   ThreadArray::setThreadsLogical(n, numBlocks, numThreads);

   // Add real scalar b to each complex element, then check for errors
   _addEqS<<<numBlocks, numThreads>>>(a.cArray() + beginIdA, b, n);
   cudaErrorCheck( cudaGetLastError() );
}
1726
// Vector subtraction in-place, a[i] -= b[i], kernel wrapper (cudaReal).
void subEqV(DeviceArray<cudaReal>& a, DeviceArray<cudaReal> const & b,
            const int beginIdA, const int beginIdB, const int n)
{
   // Preconditions: each array must hold n elements past its offset
   UTIL_CHECK(a.capacity() >= n + beginIdA);
   UTIL_CHECK(b.capacity() >= n + beginIdB);

   // GPU resources
   int nBlocks, nThreads;
   ThreadArray::setThreadsLogical(n, nBlocks, nThreads);

   // Launch kernel
   _subEqV<<<nBlocks, nThreads>>>(a.cArray()+beginIdA,
                                  b.cArray()+beginIdB, n);
   cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
}
1743
1744// Vector subtraction in-place, a[i] -= b[i], kernel wrapper (cudaComplex).
1746 const int beginIdA, const int beginIdB, const int n)
1747{
1748 UTIL_CHECK(a.capacity() >= n + beginIdA);
1749 UTIL_CHECK(b.capacity() >= n + beginIdB);
1750
1751 // GPU resources
1752 int nBlocks, nThreads;
1753 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1754
1755 // Launch kernel
1756 _subEqV<<<nBlocks, nThreads>>>(a.cArray()+beginIdA,
1757 b.cArray()+beginIdB, n);
1758 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1759}
1760
1761// Vector subtraction in-place, a[i] -= b[i], kernel wrapper (mixed).
1763 const int beginIdA, const int beginIdB, const int n)
1764{
1765 UTIL_CHECK(a.capacity() >= n + beginIdA);
1766 UTIL_CHECK(b.capacity() >= n + beginIdB);
1767
1768 // GPU resources
1769 int nBlocks, nThreads;
1770 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1771
1772 // Launch kernel
1773 _subEqV<<<nBlocks, nThreads>>>(a.cArray()+beginIdA,
1774 b.cArray()+beginIdB, n);
1775 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1776}
1777
// Vector subtraction in-place, a[i] -= b, kernel wrapper (cudaReal).
void subEqS(DeviceArray<cudaReal>& a, cudaReal const b,
            const int beginIdA, const int n)
{
   // Elements [beginIdA, beginIdA + n) must lie within array a
   UTIL_CHECK(a.capacity() >= n + beginIdA);

   // Choose a 1D execution configuration for n logical threads
   int numBlocks, numThreads;
   ThreadArray::setThreadsLogical(n, numBlocks, numThreads);

   // Subtract scalar b from each element, then check for launch errors
   _subEqS<<<numBlocks, numThreads>>>(a.cArray() + beginIdA, b, n);
   cudaErrorCheck( cudaGetLastError() );
}
1792
// Vector subtraction in-place, a[i] -= b, kernel wrapper (cudaComplex).
void subEqS(DeviceArray<cudaComplex>& a, cudaComplex const b,
            const int beginIdA, const int n)
{
   // Elements [beginIdA, beginIdA + n) must lie within array a
   UTIL_CHECK(a.capacity() >= n + beginIdA);

   // Choose a 1D execution configuration for n logical threads
   int numBlocks, numThreads;
   ThreadArray::setThreadsLogical(n, numBlocks, numThreads);

   // Subtract scalar b from each element, then check for launch errors
   _subEqS<<<numBlocks, numThreads>>>(a.cArray() + beginIdA, b, n);
   cudaErrorCheck( cudaGetLastError() );
}
1807
// Vector subtraction in-place, a[i] -= b, kernel wrapper (mixed).
void subEqS(DeviceArray<cudaComplex>& a, cudaReal const b,
            const int beginIdA, const int n)
{
   // Elements [beginIdA, beginIdA + n) must lie within array a
   UTIL_CHECK(a.capacity() >= n + beginIdA);

   // Choose a 1D execution configuration for n logical threads
   int numBlocks, numThreads;
   ThreadArray::setThreadsLogical(n, numBlocks, numThreads);

   // Subtract real scalar b from each complex element; check for errors
   _subEqS<<<numBlocks, numThreads>>>(a.cArray() + beginIdA, b, n);
   cudaErrorCheck( cudaGetLastError() );
}
1822
// Vector multiplication in-place, a[i] *= b[i], kernel wrapper (cudaReal).
void mulEqV(DeviceArray<cudaReal>& a, DeviceArray<cudaReal> const & b,
            const int beginIdA, const int beginIdB, const int n)
{
   // Preconditions: each array must hold n elements past its offset
   UTIL_CHECK(a.capacity() >= n + beginIdA);
   UTIL_CHECK(b.capacity() >= n + beginIdB);

   // GPU resources
   int nBlocks, nThreads;
   ThreadArray::setThreadsLogical(n, nBlocks, nThreads);

   // Launch kernel
   _mulEqV<<<nBlocks, nThreads>>>(a.cArray()+beginIdA,
                                  b.cArray()+beginIdB, n);
   cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
}
1839
1840// Vector multiplication in-place, a[i] *= b[i], kernel wrapper (cudaComplex).
1842 const int beginIdA, const int beginIdB, const int n)
1843{
1844 UTIL_CHECK(a.capacity() >= n + beginIdA);
1845 UTIL_CHECK(b.capacity() >= n + beginIdB);
1846
1847 // GPU resources
1848 int nBlocks, nThreads;
1849 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1850
1851 // Launch kernel
1852 _mulEqV<<<nBlocks, nThreads>>>(a.cArray()+beginIdA,
1853 b.cArray()+beginIdB, n);
1854 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1855}
1856
1857// Vector multiplication in-place, a[i] *= b[i], kernel wrapper (mixed).
1859 const int beginIdA, const int beginIdB, const int n)
1860{
1861 UTIL_CHECK(a.capacity() >= n + beginIdA);
1862 UTIL_CHECK(b.capacity() >= n + beginIdB);
1863
1864 // GPU resources
1865 int nBlocks, nThreads;
1866 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1867
1868 // Launch kernel
1869 _mulEqV<<<nBlocks, nThreads>>>(a.cArray()+beginIdA,
1870 b.cArray()+beginIdB, n);
1871 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1872}
1873
// Vector multiplication in-place, a[i] *= b, kernel wrapper (cudaReal).
void mulEqS(DeviceArray<cudaReal>& a, cudaReal const b,
            const int beginIdA, const int n)
{
   // Elements [beginIdA, beginIdA + n) must lie within array a
   UTIL_CHECK(a.capacity() >= n + beginIdA);

   // Choose a 1D execution configuration for n logical threads
   int numBlocks, numThreads;
   ThreadArray::setThreadsLogical(n, numBlocks, numThreads);

   // Multiply each element by scalar b, then check for launch errors
   _mulEqS<<<numBlocks, numThreads>>>(a.cArray() + beginIdA, b, n);
   cudaErrorCheck( cudaGetLastError() );
}
1888
// Vector multiplication in-place, a[i] *= b, kernel wrapper (cudaComplex).
void mulEqS(DeviceArray<cudaComplex>& a, cudaComplex const b,
            const int beginIdA, const int n)
{
   // Elements [beginIdA, beginIdA + n) must lie within array a
   UTIL_CHECK(a.capacity() >= n + beginIdA);

   // Choose a 1D execution configuration for n logical threads
   int numBlocks, numThreads;
   ThreadArray::setThreadsLogical(n, numBlocks, numThreads);

   // Multiply each element by scalar b, then check for launch errors
   _mulEqS<<<numBlocks, numThreads>>>(a.cArray() + beginIdA, b, n);
   cudaErrorCheck( cudaGetLastError() );
}
1903
// Vector multiplication in-place, a[i] *= b, kernel wrapper (mixed).
void mulEqS(DeviceArray<cudaComplex>& a, cudaReal const b,
            const int beginIdA, const int n)
{
   // Elements [beginIdA, beginIdA + n) must lie within array a
   UTIL_CHECK(a.capacity() >= n + beginIdA);

   // Choose a 1D execution configuration for n logical threads
   int numBlocks, numThreads;
   ThreadArray::setThreadsLogical(n, numBlocks, numThreads);

   // Scale each complex element by real scalar b; check for errors
   _mulEqS<<<numBlocks, numThreads>>>(a.cArray() + beginIdA, b, n);
   cudaErrorCheck( cudaGetLastError() );
}
1918
// Vector division in-place, a[i] /= b[i], kernel wrapper (cudaReal).
void divEqV(DeviceArray<cudaReal>& a, DeviceArray<cudaReal> const & b,
            const int beginIdA, const int beginIdB, const int n)
{
   // Preconditions: each array must hold n elements past its offset
   UTIL_CHECK(a.capacity() >= n + beginIdA);
   UTIL_CHECK(b.capacity() >= n + beginIdB);

   // GPU resources
   int nBlocks, nThreads;
   ThreadArray::setThreadsLogical(n, nBlocks, nThreads);

   // Launch kernel (no zero-divisor check is performed on device)
   _divEqV<<<nBlocks, nThreads>>>(a.cArray()+beginIdA,
                                  b.cArray()+beginIdB, n);
   cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
}
1935
1936// Vector division in-place, a[i] /= b[i], kernel wrapper (mixed).
1938 const int beginIdA, const int beginIdB, const int n)
1939{
1940 UTIL_CHECK(a.capacity() >= n + beginIdA);
1941 UTIL_CHECK(b.capacity() >= n + beginIdB);
1942
1943 // GPU resources
1944 int nBlocks, nThreads;
1945 ThreadArray::setThreadsLogical(n, nBlocks, nThreads);
1946
1947 // Launch kernel
1948 _divEqV<<<nBlocks, nThreads>>>(a.cArray()+beginIdA,
1949 b.cArray()+beginIdB, n);
1950 cudaErrorCheck( cudaGetLastError() ); // ensure no CUDA errors
1951}
1952
// Vector division in-place, a[i] /= b, kernel wrapper (cudaReal).
void divEqS(DeviceArray<cudaReal>& a, cudaReal const b,
            const int beginIdA, const int n)
{
   // Elements [beginIdA, beginIdA + n) must lie within array a
   UTIL_CHECK(a.capacity() >= n + beginIdA);

   // Choose a 1D execution configuration for n logical threads
   int numBlocks, numThreads;
   ThreadArray::setThreadsLogical(n, numBlocks, numThreads);

   // Divide each element by scalar b, then check for launch errors
   _divEqS<<<numBlocks, numThreads>>>(a.cArray() + beginIdA, b, n);
   cudaErrorCheck( cudaGetLastError() );
}
1967
// Vector division in-place, a[i] /= b, kernel wrapper (mixed).
void divEqS(DeviceArray<cudaComplex>& a, cudaReal const b,
            const int beginIdA, const int n)
{
   // Elements [beginIdA, beginIdA + n) must lie within array a
   UTIL_CHECK(a.capacity() >= n + beginIdA);

   // Choose a 1D execution configuration for n logical threads
   int numBlocks, numThreads;
   ThreadArray::setThreadsLogical(n, numBlocks, numThreads);

   // Divide each complex element by real scalar b; check for errors
   _divEqS<<<numBlocks, numThreads>>>(a.cArray() + beginIdA, b, n);
   cudaErrorCheck( cudaGetLastError() );
}
1982
1983} // namespace VecOp
1984} // namespace Cuda
1985} // namespace Prdc
1986} // namespace Pscf
Dynamic array on the GPU device with aligned data.
Definition rpg/System.h:32
int capacity() const
Return allocated capacity.
Data * cArray()
Return pointer to underlying C array.
#define UTIL_CHECK(condition)
Assertion macro suitable for serial or parallel production code.
Definition global.h:68
void setThreadsLogical(int nThreadsLogical)
Given total number of threads, set 1D execution configuration.
void addEqV(DeviceArray< cudaReal > &a, DeviceArray< cudaReal > const &b, const int beginIdA, const int beginIdB, const int n)
Vector addition in-place, a[i] += b[i], kernel wrapper (cudaReal).
Definition VecOp.cu:1632
void eqV(DeviceArray< cudaReal > &a, DeviceArray< cudaReal > const &b, const int beginIdA, const int beginIdB, const int n)
Vector assignment, a[i] = b[i], kernel wrapper (cudaReal).
Definition VecOp.cu:1020
void subEqS(DeviceArray< cudaReal > &a, cudaReal const b, const int beginIdA, const int n)
Vector subtraction in-place, a[i] -= b, kernel wrapper (cudaReal).
Definition VecOp.cu:1779
void addEqS(DeviceArray< cudaReal > &a, cudaReal const b, const int beginIdA, const int n)
Vector addition in-place, a[i] += b, kernel wrapper (cudaReal).
Definition VecOp.cu:1683
void subVS(DeviceArray< cudaReal > &a, DeviceArray< cudaReal > const &b, cudaReal const c, const int beginIdA, const int beginIdB, const int n)
Vector subtraction, a[i] = b[i] - c, kernel wrapper (cudaReal).
Definition VecOp.cu:1304
void divEqV(DeviceArray< cudaReal > &a, DeviceArray< cudaReal > const &b, const int beginIdA, const int beginIdB, const int n)
Vector division in-place, a[i] /= b[i], kernel wrapper (cudaReal).
Definition VecOp.cu:1920
void divVV(DeviceArray< cudaReal > &a, DeviceArray< cudaReal > const &b, DeviceArray< cudaReal > const &c, const int beginIdA, const int beginIdB, const int beginIdC, const int n)
Vector division, a[i] = b[i] / c[i], kernel wrapper (cudaReal).
Definition VecOp.cu:1524
void addVV(DeviceArray< cudaReal > &a, DeviceArray< cudaReal > const &b, DeviceArray< cudaReal > const &c, const int beginIdA, const int beginIdB, const int beginIdC, const int n)
Vector addition, a[i] = b[i] + c[i], kernel wrapper (cudaReal).
Definition VecOp.cu:1084
void mulVV(DeviceArray< cudaReal > &a, DeviceArray< cudaReal > const &b, DeviceArray< cudaReal > const &c, const int beginIdA, const int beginIdB, const int beginIdC, const int n)
Vector multiplication, a[i] = b[i] * c[i], kernel wrapper (cudaReal).
Definition VecOp.cu:1376
void subVV(DeviceArray< cudaReal > &a, DeviceArray< cudaReal > const &b, DeviceArray< cudaReal > const &c, const int beginIdA, const int beginIdB, const int beginIdC, const int n)
Vector subtraction, a[i] = b[i] - c[i], kernel wrapper (cudaReal).
Definition VecOp.cu:1228
void addVS(DeviceArray< cudaReal > &a, DeviceArray< cudaReal > const &b, cudaReal const c, const int beginIdA, const int beginIdB, int n)
Vector addition, a[i] = b[i] + c, kernel wrapper (cudaReal).
Definition VecOp.cu:1160
void mulEqS(DeviceArray< cudaReal > &a, cudaReal const b, const int beginIdA, const int n)
Vector multiplication in-place, a[i] *= b, kernel wrapper (cudaReal).
Definition VecOp.cu:1875
void mulEqV(DeviceArray< cudaReal > &a, DeviceArray< cudaReal > const &b, const int beginIdA, const int beginIdB, const int n)
Vector multiplication in-place, a[i] *= b[i], kernel wrapper (cudaReal).
Definition VecOp.cu:1824
void subEqV(DeviceArray< cudaReal > &a, DeviceArray< cudaReal > const &b, const int beginIdA, const int beginIdB, const int n)
Vector subtraction in-place, a[i] -= b[i], kernel wrapper (cudaReal).
Definition VecOp.cu:1728
void eqS(DeviceArray< cudaReal > &a, cudaReal const b, const int beginIdA, const int n)
Vector assignment, a[i] = b, kernel wrapper (cudaReal).
Definition VecOp.cu:1054
void mulVS(DeviceArray< cudaReal > &a, DeviceArray< cudaReal > const &b, cudaReal const c, const int beginIdA, const int beginIdB, const int n)
Vector multiplication, a[i] = b[i] * c, kernel wrapper (cudaReal).
Definition VecOp.cu:1452
void divVS(DeviceArray< cudaReal > &a, DeviceArray< cudaReal > const &b, cudaReal const c, const int beginIdA, const int beginIdB, const int n)
Vector division, a[i] = b[i] / c, kernel wrapper (cudaReal).
Definition VecOp.cu:1562
void expV(DeviceArray< cudaReal > &a, DeviceArray< cudaReal > const &b, const int beginIdA, const int beginIdB, const int n)
Vector exponentiation, a[i] = exp(b[i]), kernel wrapper (cudaReal).
Definition VecOp.cu:1598
void divEqS(DeviceArray< cudaReal > &a, cudaReal const b, const int beginIdA, const int n)
Vector division in-place, a[i] /= b, kernel wrapper (cudaReal).
Definition VecOp.cu:1954
PSCF package top-level namespace.
Definition param_pc.dox:1