9#include <pscf/cuda/ThreadArray.h>
10#include <pscf/cuda/cudaErrorCheck.h>
32 int nThreads = blockDim.x * gridDim.x;
33 int startID = blockIdx.x * blockDim.x + threadIdx.x;
34 for (
int i = startID; i < n; i +=
nThreads) {
50 int nThreads = blockDim.x * gridDim.x;
51 int startID = blockIdx.x * blockDim.x + threadIdx.x;
52 for (
int i = startID; i < n; i +=
nThreads) {
71 int nThreads = blockDim.x * gridDim.x;
72 int startID = blockIdx.x * blockDim.x + threadIdx.x;
73 for (
int i = startID; i < n; i +=
nThreads) {
89 int nThreads = blockDim.x * gridDim.x;
90 int startID = blockIdx.x * blockDim.x + threadIdx.x;
91 for (
int i = startID; i < n; i +=
nThreads) {
106 int nThreads = blockDim.x * gridDim.x;
107 int startID = blockIdx.x * blockDim.x + threadIdx.x;
108 for (
int i = startID; i < n; i +=
nThreads) {
127 int nThreads = blockDim.x * gridDim.x;
128 int startID = blockIdx.x * blockDim.x + threadIdx.x;
129 for (
int i = startID; i < n; i +=
nThreads) {
147 int nThreads = blockDim.x * gridDim.x;
148 int startID = blockIdx.x * blockDim.x + threadIdx.x;
149 for (
int i = startID; i < n; i +=
nThreads) {
150 a[i].x = b[i].x + c[i].x;
151 a[i].y = b[i].y + c[i].y;
168 int nThreads = blockDim.x * gridDim.x;
169 int startID = blockIdx.x * blockDim.x + threadIdx.x;
170 for (
int i = startID; i < n; i +=
nThreads) {
171 a[i].x = b[i] + c[i].x;
189 int nThreads = blockDim.x * gridDim.x;
190 int startID = blockIdx.x * blockDim.x + threadIdx.x;
191 for (
int i = startID; i < n; i +=
nThreads) {
192 a[i].x = b[i].x + c[i];
210 int nThreads = blockDim.x * gridDim.x;
211 int startID = blockIdx.x * blockDim.x + threadIdx.x;
212 for (
int i = startID; i < n; i +=
nThreads) {
230 int nThreads = blockDim.x * gridDim.x;
231 int startID = blockIdx.x * blockDim.x + threadIdx.x;
232 for (
int i = startID; i < n; i +=
nThreads) {
233 a[i].x = b[i].x + c.x;
234 a[i].y = b[i].y + c.y;
251 int nThreads = blockDim.x * gridDim.x;
252 int startID = blockIdx.x * blockDim.x + threadIdx.x;
253 for (
int i = startID; i < n; i +=
nThreads) {
272 int nThreads = blockDim.x * gridDim.x;
273 int startID = blockIdx.x * blockDim.x + threadIdx.x;
274 for (
int i = startID; i < n; i +=
nThreads) {
293 int nThreads = blockDim.x * gridDim.x;
294 int startID = blockIdx.x * blockDim.x + threadIdx.x;
295 for (
int i = startID; i < n; i +=
nThreads) {
311 int nThreads = blockDim.x * gridDim.x;
312 int startID = blockIdx.x * blockDim.x + threadIdx.x;
313 for (
int i = startID; i < n; i +=
nThreads) {
314 a[i].x = b[i].x - c[i].x;
315 a[i].y = b[i].y - c[i].y;
330 int nThreads = blockDim.x * gridDim.x;
331 int startID = blockIdx.x * blockDim.x + threadIdx.x;
332 for (
int i = startID; i < n; i +=
nThreads) {
333 a[i].x = b[i] - c[i].x;
334 a[i].y = 0.0 - c[i].y;
351 int nThreads = blockDim.x * gridDim.x;
352 int startID = blockIdx.x * blockDim.x + threadIdx.x;
353 for (
int i = startID; i < n; i +=
nThreads) {
354 a[i].x = b[i].x - c[i];
372 int nThreads = blockDim.x * gridDim.x;
373 int startID = blockIdx.x * blockDim.x + threadIdx.x;
374 for (
int i = startID; i < n; i +=
nThreads) {
392 int nThreads = blockDim.x * gridDim.x;
393 int startID = blockIdx.x * blockDim.x + threadIdx.x;
394 for (
int i = startID; i < n; i +=
nThreads) {
395 a[i].x = b[i].x - c.x;
396 a[i].y = b[i].y - c.y;
412 int nThreads = blockDim.x * gridDim.x;
413 int startID = blockIdx.x * blockDim.x + threadIdx.x;
414 for (
int i = startID; i < n; i +=
nThreads) {
433 int nThreads = blockDim.x * gridDim.x;
434 int startID = blockIdx.x * blockDim.x + threadIdx.x;
435 for (
int i = startID; i < n; i +=
nThreads) {
454 int nThreads = blockDim.x * gridDim.x;
455 int startID = blockIdx.x * blockDim.x + threadIdx.x;
456 for (
int i = startID; i < n; i +=
nThreads) {
474 int nThreads = blockDim.x * gridDim.x;
475 int startID = blockIdx.x * blockDim.x + threadIdx.x;
476 for (
int i = startID; i < n; i +=
nThreads) {
477 a[i].x = (b[i].x * c[i].x) - (b[i].y * c[i].y);
478 a[i].y = (b[i].x * c[i].y) + (b[i].y * c[i].x);
494 int nThreads = blockDim.x * gridDim.x;
495 int startID = blockIdx.x * blockDim.x + threadIdx.x;
496 for (
int i = startID; i < n; i +=
nThreads) {
497 a[i].x = b[i] * c[i].x;
498 a[i].y = b[i] * c[i].y;
514 int nThreads = blockDim.x * gridDim.x;
515 int startID = blockIdx.x * blockDim.x + threadIdx.x;
516 for (
int i = startID; i < n; i +=
nThreads) {
517 a[i].x = b[i].x * c[i];
518 a[i].y = b[i].y * c[i];
533 int nThreads = blockDim.x * gridDim.x;
534 int startID = blockIdx.x * blockDim.x + threadIdx.x;
535 for (
int i = startID; i < n; i +=
nThreads) {
553 int nThreads = blockDim.x * gridDim.x;
554 int startID = blockIdx.x * blockDim.x + threadIdx.x;
555 for (
int i = startID; i < n; i +=
nThreads) {
556 a[i].x = (b[i].x * c.x) - (b[i].y * c.y);
557 a[i].y = (b[i].x * c.y) + (b[i].y * c.x);
574 int nThreads = blockDim.x * gridDim.x;
575 int startID = blockIdx.x * blockDim.x + threadIdx.x;
576 for (
int i = startID; i < n; i +=
nThreads) {
593 int nThreads = blockDim.x * gridDim.x;
594 int startID = blockIdx.x * blockDim.x + threadIdx.x;
595 for (
int i = startID; i < n; i +=
nThreads) {
612 int nThreads = blockDim.x * gridDim.x;
613 int startID = blockIdx.x * blockDim.x + threadIdx.x;
614 for (
int i = startID; i < n; i +=
nThreads) {
630 int nThreads = blockDim.x * gridDim.x;
631 int startID = blockIdx.x * blockDim.x + threadIdx.x;
632 for (
int i = startID; i < n; i +=
nThreads) {
633 a[i].x = b[i].x / c[i];
634 a[i].y = b[i].y / c[i];
649 int nThreads = blockDim.x * gridDim.x;
650 int startID = blockIdx.x * blockDim.x + threadIdx.x;
651 for (
int i = startID; i < n; i +=
nThreads) {
667 int nThreads = blockDim.x * gridDim.x;
668 int startID = blockIdx.x * blockDim.x + threadIdx.x;
669 for (
int i = startID; i < n; i +=
nThreads) {
686 int nThreads = blockDim.x * gridDim.x;
687 int startID = blockIdx.x * blockDim.x + threadIdx.x;
688 for (
int i = startID; i < n; i +=
nThreads) {
705 int nThreads = blockDim.x * gridDim.x;
706 int startID = blockIdx.x * blockDim.x + threadIdx.x;
707 for (
int i = startID; i < n; i +=
nThreads) {
722 int nThreads = blockDim.x * gridDim.x;
723 int startID = blockIdx.x * blockDim.x + threadIdx.x;
724 for (
int i = startID; i < n; i +=
nThreads) {
744 int nThreads = blockDim.x * gridDim.x;
745 int startID = blockIdx.x * blockDim.x + threadIdx.x;
746 for (
int i = startID; i < n; i +=
nThreads) {
762 int nThreads = blockDim.x * gridDim.x;
763 int startID = blockIdx.x * blockDim.x + threadIdx.x;
764 for (
int i = startID; i < n; i +=
nThreads) {
779 int nThreads = blockDim.x * gridDim.x;
780 int startID = blockIdx.x * blockDim.x + threadIdx.x;
781 for (
int i = startID; i < n; i +=
nThreads) {
797 int nThreads = blockDim.x * gridDim.x;
798 int startID = blockIdx.x * blockDim.x + threadIdx.x;
799 for (
int i = startID; i < n; i +=
nThreads) {
815 int nThreads = blockDim.x * gridDim.x;
816 int startID = blockIdx.x * blockDim.x + threadIdx.x;
817 for (
int i = startID; i < n; i +=
nThreads) {
832 int nThreads = blockDim.x * gridDim.x;
833 int startID = blockIdx.x * blockDim.x + threadIdx.x;
834 for (
int i = startID; i < n; i +=
nThreads) {
849 int nThreads = blockDim.x * gridDim.x;
850 int startID = blockIdx.x * blockDim.x + threadIdx.x;
851 for (
int i = startID; i < n; i +=
nThreads) {
867 int nThreads = blockDim.x * gridDim.x;
868 int startID = blockIdx.x * blockDim.x + threadIdx.x;
869 for (
int i = startID; i < n; i +=
nThreads) {
884 int nThreads = blockDim.x * gridDim.x;
885 int startID = blockIdx.x * blockDim.x + threadIdx.x;
886 for (
int i = startID; i < n; i +=
nThreads) {
901 int nThreads = blockDim.x * gridDim.x;
902 int startID = blockIdx.x * blockDim.x + threadIdx.x;
903 for (
int i = startID; i < n; i +=
nThreads) {
919 int nThreads = blockDim.x * gridDim.x;
920 int startID = blockIdx.x * blockDim.x + threadIdx.x;
921 for (
int i = startID; i < n; i +=
nThreads) {
936 int nThreads = blockDim.x * gridDim.x;
937 int startID = blockIdx.x * blockDim.x + threadIdx.x;
938 for (
int i = startID; i < n; i +=
nThreads) {
953 int nThreads = blockDim.x * gridDim.x;
954 int startID = blockIdx.x * blockDim.x + threadIdx.x;
956 for (
int i = startID; i < n; i +=
nThreads) {
957 c.x = (a[i].x * b[i].x) - (a[i].y * b[i].y);
958 c.y = (a[i].x * b[i].y) + (a[i].y * b[i].x);
974 int nThreads = blockDim.x * gridDim.x;
975 int startID = blockIdx.x * blockDim.x + threadIdx.x;
976 for (
int i = startID; i < n; i +=
nThreads) {
992 int nThreads = blockDim.x * gridDim.x;
993 int startID = blockIdx.x * blockDim.x + threadIdx.x;
994 for (
int i = startID; i < n; i +=
nThreads) {
1009 int nThreads = blockDim.x * gridDim.x;
1010 int startID = blockIdx.x * blockDim.x + threadIdx.x;
1012 for (
int i = startID; i < n; i +=
nThreads) {
1013 c.x = (a[i].x * b.x) - (a[i].y * b.y);
1014 c.y = (a[i].x * b.y) + (a[i].y * b.x);
1030 int nThreads = blockDim.x * gridDim.x;
1031 int startID = blockIdx.x * blockDim.x + threadIdx.x;
1032 for (
int i = startID; i < n; i +=
nThreads) {
1048 int nThreads = blockDim.x * gridDim.x;
1049 int startID = blockIdx.x * blockDim.x + threadIdx.x;
1050 for (
int i = startID; i < n; i +=
nThreads) {
1065 int nThreads = blockDim.x * gridDim.x;
1066 int startID = blockIdx.x * blockDim.x + threadIdx.x;
1067 for (
int i = startID; i < n; i +=
nThreads) {
1083 int nThreads = blockDim.x * gridDim.x;
1084 int startID = blockIdx.x * blockDim.x + threadIdx.x;
1085 for (
int i = startID; i < n; i +=
nThreads) {
1100 int nThreads = blockDim.x * gridDim.x;
1101 int startID = blockIdx.x * blockDim.x + threadIdx.x;
1102 for (
int i = startID; i < n; i +=
nThreads) {
1120 int nThreads = blockDim.x * gridDim.x;
1121 int startID = blockIdx.x * blockDim.x + threadIdx.x;
1122 for (
int i = startID; i < n; i +=
nThreads) {
1137 int nThreads = blockDim.x * gridDim.x;
1138 int startID = blockIdx.x * blockDim.x + threadIdx.x;
1139 for (
int i = startID; i < n; i +=
nThreads) {
1140 a[i].x = exp(b[i].x) * cos(b[i].y);
1141 a[i].y = exp(b[i].x) * sin(b[i].y);
1157 int nThreads = blockDim.x * gridDim.x;
1158 int startID = blockIdx.x * blockDim.x + threadIdx.x;
1159 for (
int i = startID; i < n; i +=
nThreads) {
1174 int nThreads = blockDim.x * gridDim.x;
1175 int startID = blockIdx.x * blockDim.x + threadIdx.x;
1177 for (
int i = startID; i < n; i +=
nThreads) {
1180 a[i].x = (bx*bx) - (by*by);
1181 a[i].y = 2.0 * bx * by;
1197 int nThreads = blockDim.x * gridDim.x;
1198 int startID = blockIdx.x * blockDim.x + threadIdx.x;
1199 for (
int i = startID; i < n; i +=
nThreads) {
1200 a[i] = std::fabs(b[i]);
1214 int nThreads = blockDim.x * gridDim.x;
1215 int startID = blockIdx.x * blockDim.x + threadIdx.x;
1217 for (
int i = startID; i < n; i +=
nThreads) {
1220 a[i] = bx*bx + by*by;
1233 const int beginIdA,
const int beginIdB,
const int n)
1239 int nBlocks, nThreads;
1244 _eqV<<<nBlocks, nThreads>>>(a.
cArray()+beginIdA,
1246 cudaErrorCheck( cudaGetLastError() );
1250 cudaErrorCheck( cudaMemcpy(a.
cArray() + beginIdA,
1253 cudaMemcpyDeviceToDevice) );
1262 const int beginIdA,
const int beginIdB,
const int n)
1268 int nBlocks, nThreads;
1272 cudaErrorCheck( cudaMemcpy(a.
cArray() + beginIdA,
1275 cudaMemcpyDeviceToHost) );
1284 const int beginIdA,
const int beginIdB,
const int n)
1290 int nBlocks, nThreads;
1294 cudaErrorCheck( cudaMemcpy(a.
cArray() + beginIdA,
1297 cudaMemcpyHostToDevice) );
1306 const int beginIdA,
const int beginIdB,
const int n)
1312 int nBlocks, nThreads;
1316 _eqV<<<nBlocks, nThreads>>>(a.
cArray()+beginIdA,
1318 cudaErrorCheck( cudaGetLastError() );
1327 const int beginIdA,
const int beginIdB,
const int beginIdC,
1335 int nBlocks, nThreads;
1339 _eqV<<<nBlocks, nThreads>>>(a.
cArray()+beginIdA,
1342 cudaErrorCheck( cudaGetLastError() );
1350 const int beginIdA,
const int n)
1355 int nBlocks, nThreads;
1359 _eqS<<<nBlocks, nThreads>>>(a.
cArray()+beginIdA, b, n);
1360 cudaErrorCheck( cudaGetLastError() );
1368 const int beginIdA,
const int n)
1373 int nBlocks, nThreads;
1377 _eqS<<<nBlocks, nThreads>>>(a.
cArray()+beginIdA, b, n);
1378 cudaErrorCheck( cudaGetLastError() );
1389 const int beginIdA,
const int beginIdB,
const int beginIdC,
1397 int nBlocks, nThreads;
1401 _addVV<<<nBlocks, nThreads>>>(a.
cArray()+beginIdA,
1404 cudaErrorCheck( cudaGetLastError() );
1413 const int beginIdA,
const int beginIdB,
const int beginIdC,
1421 int nBlocks, nThreads;
1425 _addVV<<<nBlocks, nThreads>>>(a.
cArray()+beginIdA,
1428 cudaErrorCheck( cudaGetLastError() );
1437 const int beginIdA,
const int beginIdB,
const int beginIdC,
1445 int nBlocks, nThreads;
1449 _addVV<<<nBlocks, nThreads>>>(a.
cArray() + beginIdA,
1451 c.
cArray() + beginIdC, n);
1452 cudaErrorCheck( cudaGetLastError() );
1461 const int beginIdA,
const int beginIdB,
const int beginIdC,
1469 int nBlocks, nThreads;
1473 _addVV<<<nBlocks, nThreads>>>(a.
cArray() + beginIdA,
1475 c.
cArray() + beginIdC, n);
1476 cudaErrorCheck( cudaGetLastError() );
1485 const int beginIdA,
const int beginIdB,
int n)
1491 int nBlocks, nThreads;
1495 _addVS<<<nBlocks, nThreads>>>(a.
cArray()+beginIdA,
1498 cudaErrorCheck( cudaGetLastError() );
1505 const int beginIdA,
const int beginIdB,
int n)
1511 int nBlocks, nThreads;
1515 _addVS<<<nBlocks, nThreads>>>(a.
cArray()+beginIdA,
1518 cudaErrorCheck( cudaGetLastError() );
1525 const int beginIdA,
const int beginIdB,
int n)
1531 int nBlocks, nThreads;
1535 _addVS<<<nBlocks, nThreads>>>(a.
cArray()+beginIdA,
1538 cudaErrorCheck( cudaGetLastError() );
1545 const int beginIdA,
const int beginIdB,
int n)
1551 int nBlocks, nThreads;
1555 _addVS<<<nBlocks, nThreads>>>(a.
cArray()+beginIdA,
1558 cudaErrorCheck( cudaGetLastError() );
1566 const int beginIdB,
const int beginIdC,
1574 int nBlocks, nThreads;
1578 _subVV<<<nBlocks, nThreads>>>(a.
cArray()+beginIdA, b.
cArray()+beginIdB,
1580 cudaErrorCheck( cudaGetLastError() );
1587 const int beginIdA,
const int beginIdB,
const int beginIdC,
1595 int nBlocks, nThreads;
1599 _subVV<<<nBlocks, nThreads>>>(a.
cArray()+beginIdA,
1602 cudaErrorCheck( cudaGetLastError() );
1609 const int beginIdA,
const int beginIdB,
const int beginIdC,
1617 int nBlocks, nThreads;
1621 _subVV<<<nBlocks, nThreads>>>(a.
cArray()+beginIdA,
1624 cudaErrorCheck( cudaGetLastError() );
1631 const int beginIdA,
const int beginIdB,
const int beginIdC,
1639 int nBlocks, nThreads;
1643 _subVV<<<nBlocks, nThreads>>>(a.
cArray()+beginIdA,
1646 cudaErrorCheck( cudaGetLastError() );
1653 const int beginIdA,
const int beginIdB,
1660 int nBlocks, nThreads;
1664 _subVS<<<nBlocks, nThreads>>>(a.
cArray()+beginIdA,
1667 cudaErrorCheck( cudaGetLastError() );
1673 const cudaComplex c,
const int beginIdA,
const int beginIdB,
1680 int nBlocks, nThreads;
1684 _subVS<<<nBlocks, nThreads>>>(a.
cArray()+beginIdA,
1687 cudaErrorCheck( cudaGetLastError() );
1693 const cudaComplex c,
const int beginIdA,
const int beginIdB,
1700 int nBlocks, nThreads;
1704 _subVS<<<nBlocks, nThreads>>>(a.
cArray()+beginIdA, b.
cArray()+beginIdB,
1706 cudaErrorCheck( cudaGetLastError() );
1713 const int beginIdA,
const int beginIdB,
1720 int nBlocks, nThreads;
1724 _subVS<<<nBlocks, nThreads>>>(a.
cArray()+beginIdA, b.
cArray()+beginIdB,
1726 cudaErrorCheck( cudaGetLastError() );
1733 const int beginIdA,
const int beginIdB,
const int beginIdC,
1741 int nBlocks, nThreads;
1745 _mulVV<<<nBlocks, nThreads>>>(a.
cArray()+beginIdA, b.
cArray()+beginIdB,
1747 cudaErrorCheck( cudaGetLastError() );
1754 const int beginIdA,
const int beginIdB,
const int beginIdC,
1762 int nBlocks, nThreads;
1766 _mulVV<<<nBlocks, nThreads>>>(a.
cArray()+beginIdA,
1769 cudaErrorCheck( cudaGetLastError() );
1776 const int beginIdA,
const int beginIdB,
const int beginIdC,
1784 int nBlocks, nThreads;
1788 _mulVV<<<nBlocks, nThreads>>>(a.
cArray()+beginIdA,
1791 cudaErrorCheck( cudaGetLastError() );
1798 const int beginIdA,
const int beginIdB,
const int beginIdC,
1806 int nBlocks, nThreads;
1810 _mulVV<<<nBlocks, nThreads>>>(a.
cArray()+beginIdA,
1813 cudaErrorCheck( cudaGetLastError() );
1820 const int beginIdA,
const int beginIdB,
1827 int nBlocks, nThreads;
1831 _mulVS<<<nBlocks, nThreads>>>(a.
cArray()+beginIdA,
1834 cudaErrorCheck( cudaGetLastError() );
1841 const int beginIdA,
const int beginIdB,
1848 int nBlocks, nThreads;
1852 _mulVS<<<nBlocks, nThreads>>>(a.
cArray()+beginIdA, b.
cArray()+beginIdB,
1854 cudaErrorCheck( cudaGetLastError() );
1861 const int beginIdA,
const int beginIdB,
const int n)
1867 int nBlocks, nThreads;
1871 _mulVS<<<nBlocks, nThreads>>>(a.
cArray()+beginIdA, b.
cArray()+beginIdB,
1873 cudaErrorCheck( cudaGetLastError() );
1880 const int beginIdA,
const int beginIdB,
1887 int nBlocks, nThreads;
1891 _mulVS<<<nBlocks, nThreads>>>(a.
cArray()+beginIdA, b.
cArray()+beginIdB,
1893 cudaErrorCheck( cudaGetLastError() );
1900 const int beginIdB,
const int beginIdC,
const int n)
1907 int nBlocks, nThreads;
1911 _divVV<<<nBlocks, nThreads>>>(a.
cArray()+beginIdA, b.
cArray()+beginIdB,
1913 cudaErrorCheck( cudaGetLastError() );
1922 const int beginIdA,
const int beginIdB,
const int beginIdC,
1930 int nBlocks, nThreads;
1934 _divVV<<<nBlocks, nThreads>>>(a.
cArray() + beginIdA,
1936 c.
cArray() + beginIdC, n);
1937 cudaErrorCheck( cudaGetLastError() );
1946 const int beginIdA,
const int beginIdB,
1953 int nBlocks, nThreads;
1957 _divVS<<<nBlocks, nThreads>>>(a.
cArray() + beginIdA,
1960 cudaErrorCheck( cudaGetLastError() );
1969 const int beginIdA,
const int beginIdB,
1976 int nBlocks, nThreads;
1980 _divVS<<<nBlocks, nThreads>>>(a.
cArray()+beginIdA,
1983 cudaErrorCheck( cudaGetLastError() );
1992 const int beginIdA,
const int beginIdC,
const int n)
1998 int nBlocks, nThreads;
2002 _divSV<<<nBlocks, nThreads>>>(a.
cArray() + beginIdA, b,
2003 c.
cArray() + beginIdC, n);
2004 cudaErrorCheck( cudaGetLastError() );
2014 const int beginIdA,
const int beginIdB,
const int n)
2020 int nBlocks, nThreads;
2024 _addEqV<<<nBlocks, nThreads>>>(a.
cArray()+beginIdA,
2026 cudaErrorCheck( cudaGetLastError() );
2034 const int beginIdA,
const int beginIdB,
const int n)
2040 int nBlocks, nThreads;
2044 _addEqV<<<nBlocks, nThreads>>>(a.
cArray()+beginIdA,
2046 cudaErrorCheck( cudaGetLastError() );
2055 const int beginIdA,
const int beginIdB,
const int beginIdC,
2063 int nBlocks, nThreads;
2067 _addEqV<<<nBlocks, nThreads>>>(a.
cArray() + beginIdA,
2071 cudaErrorCheck( cudaGetLastError() );
2079 const int beginIdA,
const int beginIdB,
2086 int nBlocks, nThreads;
2090 _addEqV<<<nBlocks, nThreads>>>(a.
cArray()+beginIdA,
2092 cudaErrorCheck( cudaGetLastError() );
2100 const int beginIdA,
const int n)
2105 int nBlocks, nThreads;
2109 _addEqS<<<nBlocks, nThreads>>>(a.
cArray()+beginIdA, b, n);
2110 cudaErrorCheck( cudaGetLastError() );
2118 const int beginIdA,
const int n)
2123 int nBlocks, nThreads;
2127 _addEqS<<<nBlocks, nThreads>>>(a.
cArray()+beginIdA, b, n);
2128 cudaErrorCheck( cudaGetLastError() );
2136 const int beginIdA,
const int n)
2141 int nBlocks, nThreads;
2145 _addEqS<<<nBlocks, nThreads>>>(a.
cArray()+beginIdA, b, n);
2146 cudaErrorCheck( cudaGetLastError() );
2154 const int beginIdA,
const int beginIdB,
const int n)
2160 int nBlocks, nThreads;
2164 _subEqV<<<nBlocks, nThreads>>>(a.
cArray()+beginIdA,
2166 cudaErrorCheck( cudaGetLastError() );
2174 const int beginIdA,
const int beginIdB,
const int n)
2180 int nBlocks, nThreads;
2184 _subEqV<<<nBlocks, nThreads>>>(a.
cArray()+beginIdA,
2186 cudaErrorCheck( cudaGetLastError() );
2194 const int beginIdA,
const int beginIdB,
const int n)
2200 int nBlocks, nThreads;
2204 _subEqV<<<nBlocks, nThreads>>>(a.
cArray()+beginIdA,
2206 cudaErrorCheck( cudaGetLastError() );
2214 const int beginIdA,
const int n)
2219 int nBlocks, nThreads;
2223 _subEqS<<<nBlocks, nThreads>>>(a.
cArray()+beginIdA, b, n);
2224 cudaErrorCheck( cudaGetLastError() );
2232 const int beginIdA,
const int n)
2237 int nBlocks, nThreads;
2241 _subEqS<<<nBlocks, nThreads>>>(a.
cArray()+beginIdA, b, n);
2242 cudaErrorCheck( cudaGetLastError() );
2250 const int beginIdA,
const int n)
2255 int nBlocks, nThreads;
2259 _subEqS<<<nBlocks, nThreads>>>(a.
cArray() + beginIdA, b, n);
2260 cudaErrorCheck( cudaGetLastError() );
2268 const int beginIdA,
const int beginIdB,
const int n)
2274 int nBlocks, nThreads;
2278 _mulEqV<<<nBlocks, nThreads>>>(a.
cArray() + beginIdA,
2279 b.
cArray() + beginIdB, n);
2280 cudaErrorCheck( cudaGetLastError() );
2286 const int beginIdA,
const int beginIdB,
const int n)
2292 int nBlocks, nThreads;
2296 _mulEqV<<<nBlocks, nThreads>>>(a.
cArray()+beginIdA,
2298 cudaErrorCheck( cudaGetLastError() );
2304 const int beginIdA,
const int beginIdB,
const int n)
2310 int nBlocks, nThreads;
2314 _mulEqV<<<nBlocks, nThreads>>>(a.
cArray() + beginIdA,
2315 b.
cArray() + beginIdB, n);
2316 cudaErrorCheck( cudaGetLastError() );
2322 const int beginIdA,
const int n)
2327 int nBlocks, nThreads;
2331 _mulEqS<<<nBlocks, nThreads>>>(a.
cArray() + beginIdA, b, n);
2332 cudaErrorCheck( cudaGetLastError() );
2338 const int beginIdA,
const int n)
2343 int nBlocks, nThreads;
2347 _mulEqS<<<nBlocks, nThreads>>>(a.
cArray() + beginIdA, b, n);
2348 cudaErrorCheck( cudaGetLastError() );
2356 const int beginIdA,
const int n)
2361 int nBlocks, nThreads;
2365 _mulEqS<<<nBlocks, nThreads>>>(a.
cArray() + beginIdA, b, n);
2366 cudaErrorCheck( cudaGetLastError() );
2376 const int beginIdA,
const int beginIdB,
const int n)
2382 int nBlocks, nThreads;
2386 _divEqV<<<nBlocks, nThreads>>>(a.
cArray() + beginIdA,
2387 b.
cArray() + beginIdB, n);
2388 cudaErrorCheck( cudaGetLastError() );
2396 const int beginIdA,
const int beginIdB,
const int n)
2402 int nBlocks, nThreads;
2406 _divEqV<<<nBlocks, nThreads>>>(a.
cArray() + beginIdA,
2407 b.
cArray() + beginIdB, n);
2408 cudaErrorCheck( cudaGetLastError() );
2416 const int beginIdA,
const int n)
2421 int nBlocks, nThreads;
2425 _divEqS<<<nBlocks, nThreads>>>(a.
cArray() + beginIdA, b, n);
2426 cudaErrorCheck( cudaGetLastError() );
2434 const int beginIdA,
const int n)
2439 int nBlocks, nThreads;
2443 _divEqS<<<nBlocks, nThreads>>>(a.
cArray() + beginIdA, b, n);
2444 cudaErrorCheck( cudaGetLastError() );
2454 const int beginIdA,
const int beginIdB,
const int n)
2460 int nBlocks, nThreads;
2464 _expV<<<nBlocks, nThreads>>>(a.
cArray() + beginIdA,
2465 b.
cArray() + beginIdB, n);
2466 cudaErrorCheck( cudaGetLastError() );
2474 const int beginIdA,
const int beginIdB,
const int n)
2480 int nBlocks, nThreads;
2484 _expV<<<nBlocks, nThreads>>>(a.
cArray() + beginIdA,
2485 b.
cArray() + beginIdB, n);
2486 cudaErrorCheck( cudaGetLastError() );
2496 const int beginIdA,
const int beginIdB,
2503 int nBlocks, nThreads;
2507 _sqV<<<nBlocks, nThreads>>>(a.
cArray() + beginIdA,
2508 b.
cArray() + beginIdB, n);
2509 cudaErrorCheck( cudaGetLastError() );
2517 const int beginIdA,
const int beginIdB,
2524 int nBlocks, nThreads;
2528 _sqV<<<nBlocks, nThreads>>>(a.
cArray() + beginIdA,
2529 b.
cArray() + beginIdB, n);
2530 cudaErrorCheck( cudaGetLastError() );
2540 const int beginIdA,
const int beginIdB,
2547 int nBlocks, nThreads;
2551 _absV<<<nBlocks, nThreads>>>(a.
cArray() + beginIdA,
2552 b.
cArray() + beginIdB, n);
2553 cudaErrorCheck( cudaGetLastError() );
2561 const int beginIdA,
const int beginIdB,
2568 int nBlocks, nThreads;
2572 _sqAbsV<<<nBlocks, nThreads>>>(a.
cArray() + beginIdA,
2573 b.
cArray() + beginIdB, n);
2574 cudaErrorCheck( cudaGetLastError() );
Dynamic array on the GPU device with aligned data.
int capacity() const
Return array capacity.
Data * cArray()
Return pointer to underlying C array.
Array container class template.
Data * cArray()
Return a pointer to the underlying C array.
int capacity() const
Return allocated size.
#define UTIL_CHECK(condition)
Assertion macro suitable for serial or parallel production code.
void divEqS(Array< double > &a, double b)
Vector-scalar in-place division, a[i] /= b.
void addEqV(Array< double > &a, Array< double > const &b)
Vector-vector in-place addition, a[i] += b[i] (real).
void divEqV(Array< double > &a, Array< double > const &b)
Vector-vector in-place division, a[i] /= b[i].
void addEqS(Array< double > &a, double b)
Vector-scalar in-place addition, a[i] += b (real).
void sqV(Array< double > &a, Array< double > const &b)
Vector element-wise square, a[i] = b[i]*b[i] (real).
void mulEqV(Array< double > &a, Array< double > const &b)
Vector-vector in-place multiplication, a[i] *= b[i] (real).
void eqV(Array< double > &a, Array< double > const &b, const int beginIdA, const int beginIdB, const int n)
Vector assignment, a[i] = b[i] (real, slice).
void sqAbsV(Array< double > &a, Array< fftw_complex > const &b)
Square of absolute magnitude, a[i] = |b[i]|^2 (complex).
void mulEqS(Array< double > &a, double b)
Vector-scalar in-place multiplication, a[i] *= b (real).
void subVV(Array< double > &a, Array< double > const &b, Array< double > const &c)
Vector-vector subtraction, a[i] = b[i] - c[i] (real).
void absV(Array< double > &a, Array< double > const &b)
Element-wise absolute magnitude, a[i] = abs(b[i]) (real).
void expV(Array< double > &a, Array< double > const &b)
Vector exponentiation, a[i] = exp(b[i]) (real).
void divVS(Array< double > &a, Array< double > const &b, double c)
Vector-scalar division, a[i] = b[i] / c (real).
void eqS(Array< double > &a, double b)
Vector assignment, a[i] = b (real).
void mulVV(Array< double > &a, Array< double > const &b, Array< double > const &c)
Vector-vector multiplication, a[i] = b[i] * c[i] (real).
void subVS(Array< double > &a, Array< double > const &b, double c)
Vector-scalar subtraction, a[i] = b[i] - c (real).
void subEqV(Array< double > &a, Array< double > const &b)
Vector-vector in-place subtraction, a[i] -= b[i] (real).
void addVV(Array< double > &a, Array< double > const &b, Array< double > const &c)
Vector-vector addition, a[i] = b[i] + c[i] (real).
void divSV(Array< double > &a, double b, Array< double > const &c)
Scalar-vector division, a[i] = b / c[i] (real).
void addVS(Array< double > &a, Array< double > const &b, double c)
Vector-scalar addition, a[i] = b[i] + c (real).
void divVV(Array< double > &a, Array< double > const &b, Array< double > const &c)
Vector-vector division, a[i] = b[i] / c[i] (real).
void subEqS(Array< double > &a, double b)
Vector-scalar in-place subtraction, a[i] -= b (real).
void mulVS(Array< double > &a, Array< double > const &b, double c)
Vector-scalar multiplication, a[i] = b[i] * c (real).
void setThreadsLogical(int nThreadsLogical)
Given total number of threads, set 1D execution configuration.
int nThreads()
Get the number of threads per block for execution.
Vector operations on GPU or CPU.
PSCF package top-level namespace.
cufftDoubleComplex cudaComplex
Complex number type used in GPU code that uses cuFFT.
cufftDoubleReal cudaReal
Real number type used in GPU code that uses cuFFT.