/*
* Sum reduction across the final 64 elements of shared memory,
* performed within a single warp. Callers must pass tId < 32; the
* volatile qualifier forces each store to be visible to the other
* lanes without a __syncthreads().
*/
__device__ void _warpSum(volatile cudaReal* sData, int tId)
{
   sData[tId] += sData[tId + 32];
   sData[tId] += sData[tId + 16];
   sData[tId] += sData[tId + 8];
   sData[tId] += sData[tId + 4];
   sData[tId] += sData[tId + 2];
   sData[tId] += sData[tId + 1];
}
/*
* Max reduction across the final 64 elements of shared memory,
* within a single warp (tId < 32).
*/
__device__ void _warpMax(volatile cudaReal* sData, int tId)
{
   if (sData[tId + 32] > sData[tId]) sData[tId] = sData[tId + 32];
   if (sData[tId + 16] > sData[tId]) sData[tId] = sData[tId + 16];
   if (sData[tId + 8] > sData[tId]) sData[tId] = sData[tId + 8];
   if (sData[tId + 4] > sData[tId]) sData[tId] = sData[tId + 4];
   if (sData[tId + 2] > sData[tId]) sData[tId] = sData[tId + 2];
   if (sData[tId + 1] > sData[tId]) sData[tId] = sData[tId + 1];
}
/*
* Min reduction across the final 64 elements of shared memory,
* within a single warp (tId < 32).
*/
__device__ void _warpMin(volatile cudaReal* sData, int tId)
{
   if (sData[tId + 32] < sData[tId]) sData[tId] = sData[tId + 32];
   if (sData[tId + 16] < sData[tId]) sData[tId] = sData[tId + 16];
   if (sData[tId + 8] < sData[tId]) sData[tId] = sData[tId + 8];
   if (sData[tId + 4] < sData[tId]) sData[tId] = sData[tId + 4];
   if (sData[tId + 2] < sData[tId]) sData[tId] = sData[tId + 2];
   if (sData[tId + 1] < sData[tId]) sData[tId] = sData[tId + 1];
}
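/*
* Illustrative alternative (not part of the original file): the three
* functions above rely on implicit warp-synchronous execution through
* volatile shared memory. On Volta and later GPUs, explicit warp
* shuffles are the safer idiom. A minimal sketch of a shuffle-based
* warp sum, assuming a full 32-thread warp participates; the name
* _warpSumShfl is hypothetical:
*/
__device__ cudaReal _warpSumShfl(cudaReal val)
{
   // Each step halves the number of distinct partial sums
   for (int offset = 16; offset > 0; offset /= 2) {
      val += __shfl_down_sync(0xffffffff, val, offset);
   }
   return val;   // lane 0 holds the warp total
}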
/*
* Block-wise sum reduction; each block writes one partial sum.
*/
__global__ void _sum(cudaReal* sum, const cudaReal* in, int n)
{
   int tId = threadIdx.x;
   int bId = blockIdx.x;
   int bDim = blockDim.x;
   int idx = bId * (bDim*2) + tId;

   extern __shared__ cudaReal sData[];

   // Load two elements per thread. (The idx >= n guard is a
   // reconstruction; the original's handling was lost in extraction.)
   if (idx < n) {
      sData[tId] = in[idx];
      if (idx + bDim < n) {
         sData[tId] += in[idx+bDim];
      }
   } else {
      sData[tId] = 0.0;
   }
   __syncthreads();

   // Tree reduction in shared memory
   for (int stride = bDim / 2; stride > 32; stride /= 2) {
      if (tId < stride) {
         sData[tId] += sData[tId+stride];
      }
      __syncthreads();
   }
   // Final 64 elements are reduced within a single warp
   if (tId < 32) {
      _warpSum(sData, tId);
   }
   if (tId == 0) {
      sum[bId] = sData[0];
   }
}
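/*
* Launch-size example (illustrative, not from the original file): for
* n = 1,000,000 elements and 256-thread blocks, only ceil(n/2) =
* 500,000 threads are needed because each thread loads two elements,
* giving ceil(500000/256) = 1954 blocks. One pass therefore shrinks
* the data from 1,000,000 values to 1954 partial results. The same
* sizing applies to every kernel in this file.
*/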
/*
* Block-wise max reduction; each block writes one partial maximum.
*/
__global__ void _max(cudaReal* max, const cudaReal* in, int n)
{
   int tId = threadIdx.x;
   int bId = blockIdx.x;
   int bDim = blockDim.x;
   int idx = bId * (bDim*2) + tId;

   extern __shared__ cudaReal sData[];

   if (idx < n) {
      sData[tId] = in[idx];
      if (idx + bDim < n) {
         cudaReal in1 = in[idx+bDim];
         sData[tId] = (sData[tId] > in1) ? sData[tId] : in1;
      }
   } else {
      // Out of range: duplicate a valid element, which cannot
      // change the maximum
      sData[tId] = in[idx-n];
   }
   __syncthreads();

   for (int stride = bDim/2; stride > 32; stride/=2) {
      if (tId < stride) {
         if (sData[tId+stride] > sData[tId]) {
            sData[tId] = sData[tId+stride];
         }
      }
      __syncthreads();
   }
   if (tId < 32) {
      _warpMax(sData, tId);
   }
   if (tId == 0) {
      max[bId] = sData[0];
   }
}
/*
* Block-wise max-of-absolute-value reduction.
*/
__global__ void _maxAbs(cudaReal* max, const cudaReal* in, int n)
{
   int tId = threadIdx.x;
   int bId = blockIdx.x;
   int bDim = blockDim.x;
   int idx = bId * (bDim*2) + tId;

   extern __shared__ cudaReal sData[];

   if (idx < n) {
      sData[tId] = fabs(in[idx]);
      if (idx + bDim < n) {
         cudaReal in1 = fabs(in[idx+bDim]);
         sData[tId] = (sData[tId] > in1) ? sData[tId] : in1;
      }
   } else {
      // Out of range: duplicate a valid element (this branch is a
      // reconstruction, patterned after _minAbs below)
      sData[tId] = fabs(in[idx-n]);
   }
   __syncthreads();

   for (int stride = bDim/2; stride > 32; stride/=2) {
      if (tId < stride) {
         if (sData[tId+stride] > sData[tId]) {
            sData[tId] = sData[tId+stride];
         }
      }
      __syncthreads();
   }
   if (tId < 32) {
      _warpMax(sData, tId);
   }
   if (tId == 0) {
      max[bId] = sData[0];
   }
}
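/*
* Note: fabs is applied only on this first pass. The partial results
* are already non-negative, so the host wrapper below reduces them
* with plain _max; _minAbs below is paired with _min the same way.
*/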
/*
* Block-wise min reduction; each block writes one partial minimum.
*/
__global__ void _min(cudaReal* min, const cudaReal* in, int n)
{
   int tId = threadIdx.x;
   int bId = blockIdx.x;
   int bDim = blockDim.x;
   int idx = bId * (bDim*2) + tId;

   extern __shared__ cudaReal sData[];

   if (idx < n) {
      sData[tId] = in[idx];
      if (idx + bDim < n) {
         cudaReal in1 = in[idx+bDim];
         sData[tId] = (sData[tId] < in1) ? sData[tId] : in1;
      }
   } else {
      // Out of range: duplicate a valid element, which cannot
      // change the minimum
      sData[tId] = in[idx-n];
   }
   __syncthreads();

   for (int stride = bDim/2; stride > 32; stride/=2) {
      if (tId < stride) {
         if (sData[tId+stride] < sData[tId]) {
            sData[tId] = sData[tId+stride];
         }
      }
      __syncthreads();
   }
   if (tId < 32) {
      _warpMin(sData, tId);
   }
   if (tId == 0) {
      min[bId] = sData[0];
   }
}
/*
* Block-wise min-of-absolute-value reduction.
*/
__global__ void _minAbs(cudaReal* min, const cudaReal* in, int n)
{
   int tId = threadIdx.x;
   int bId = blockIdx.x;
   int bDim = blockDim.x;
   int idx = bId * (bDim*2) + tId;

   extern __shared__ cudaReal sData[];

   if (idx < n) {
      sData[tId] = fabs(in[idx]);
      if (idx + bDim < n) {
         cudaReal in1 = fabs(in[idx+bDim]);
         sData[tId] = (sData[tId] < in1) ? sData[tId] : in1;
      }
   } else {
      // Out of range: duplicate a valid element
      sData[tId] = fabs(in[idx-n]);
   }
   __syncthreads();

   for (int stride = bDim/2; stride > 32; stride/=2) {
      if (tId < stride) {
         if (sData[tId+stride] < sData[tId]) {
            sData[tId] = sData[tId+stride];
         }
      }
      __syncthreads();
   }
   if (tId < 32) {
      _warpMin(sData, tId);
   }
   if (tId == 0) {
      min[bId] = sData[0];
   }
}
/*
* Block-wise inner product: partial dot product of arrays a and b.
*/
__global__ void _innerProduct(cudaReal* ip, const cudaReal* a,
                              const cudaReal* b, int n)
{
   int tId = threadIdx.x;
   int bId = blockIdx.x;
   int bDim = blockDim.x;
   int idx = bId * (bDim*2) + tId;

   extern __shared__ cudaReal sData[];

   // Each thread multiplies and accumulates two element pairs.
   // (The idx >= n guard is a reconstruction, as in _sum.)
   if (idx < n) {
      sData[tId] = a[idx] * b[idx];
      if (idx + bDim < n) {
         sData[tId] += (a[idx+bDim] * b[idx+bDim]);
      }
   } else {
      sData[tId] = 0.0;
   }
   __syncthreads();

   for (int stride = bDim / 2; stride > 32; stride /= 2) {
      if (tId < stride) {
         sData[tId] += sData[tId+stride];
      }
      __syncthreads();
   }
   if (tId < 32) {
      _warpSum(sData, tId);
   }
   if (tId == 0) {
      ip[bId] = sData[0];
   }
}
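/*
* Note: only this first pass reads both input arrays; the host
* wrapper below reduces the resulting partial products with _sum.
*/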
/*
* Host wrapper: sum of the elements of a device array, finished on
* the host with Kahan summation. The signature, temporary-buffer
* management, ThreadGrid calls, and warning stream are reconstructions
* assumed from context; that code was lost in extraction.
*/
cudaReal sum(const cudaReal* in, int n)
{
   cudaReal *temp1 = 0, *temp2 = 0;   // ping-pong device buffers

   // Reduce on the GPU until the data are small enough for the host
   int i = 0;
   while (n >= 1e5 || i == 0) {
      // Each thread combines two elements, so ceil(n/2) threads suffice
      int nBlocks, nThreads;
      int halvedSize = ceil((float)n/2);
      ThreadGrid::setThreadsLogical(halvedSize, nBlocks, nThreads);
      if (nThreads < 64) {
         // _warpSum reads sData[tId+32]: blocks need >= 64 threads
         ThreadGrid::setThreadsPerBlock(64);
         ThreadGrid::setThreadsLogical(halvedSize, nBlocks, nThreads);
         std::cout << "nThreads too small for parallel reduction.\n"
                   << "Setting nThreads equal to 64." << std::endl;
      }
      if (i == 0) {
         cudaErrorCheck( cudaMalloc(&temp1, nBlocks*sizeof(cudaReal)) );
         _sum<<<nBlocks, nThreads, nThreads*sizeof(cudaReal)>>>(temp1, in, n);
         cudaErrorCheck( cudaGetLastError() );
      } else if (i % 2 == 1) {
         if (!temp2) cudaErrorCheck( cudaMalloc(&temp2, nBlocks*sizeof(cudaReal)) );
         _sum<<<nBlocks, nThreads, nThreads*sizeof(cudaReal)>>>(temp2, temp1, n);
         cudaErrorCheck( cudaGetLastError() );
      } else {
         _sum<<<nBlocks, nThreads, nThreads*sizeof(cudaReal)>>>(temp1, temp2, n);
         cudaErrorCheck( cudaGetLastError() );
      }
      n = nBlocks;
      i++;
   }

   // Copy the newest partial sums to the host; which buffer holds
   // them depends on the parity of the last pass
   cudaReal* temp_h = new cudaReal[n];
   if (i % 2 == 0) {
      cudaMemcpy(temp_h, temp2, n*sizeof(cudaReal), cudaMemcpyDeviceToHost);
   } else if (i % 2 == 1) {
      cudaMemcpy(temp_h, temp1, n*sizeof(cudaReal), cudaMemcpyDeviceToHost);
   }

   // Finish on the host with Kahan (compensated) summation
   cudaReal sum = 0.0, tempVal, tempSum;
   cudaReal err = 0.0;
   for (int i = 0; i < n; ++i) {
      tempVal = temp_h[i] - err;
      tempSum = sum + tempVal;
      err = tempSum - sum - tempVal;
      sum = tempSum;   // the lost low-order part is carried in err
   }

   delete [] temp_h;
   cudaFree(temp1);
   if (temp2) cudaFree(temp2);
   return sum;
}
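/*
* Illustration (not part of the original file) of why the final host
* pass above uses Kahan summation: naive float accumulation of 0.1f
* ten million times drifts visibly from the exact 1.0e6, while the
* compensated loop matches it to within float precision.
*/
static void _kahanDemo()
{
   float naive = 0.0f, kSum = 0.0f, kErr = 0.0f;
   for (int k = 0; k < 10000000; ++k) {
      naive += 0.1f;              // low-order bits are rounded away
      float y = 0.1f - kErr;      // re-inject previously lost bits
      float t = kSum + y;
      kErr = (t - kSum) - y;      // capture what this add dropped
      kSum = t;
   }
   std::cout << "naive = " << naive << ", kahan = " << kSum << std::endl;
}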
/*
* Host wrapper: maximum element of a device array. Structure and
* reconstructed portions are as in sum().
*/
cudaReal max(const cudaReal* in, int n)
{
   cudaReal *temp1 = 0, *temp2 = 0;   // ping-pong device buffers

   int i = 0;
   while (n >= 1e5 || i == 0) {
      int nBlocks, nThreads;
      int halvedSize = ceil((float)n/2);
      ThreadGrid::setThreadsLogical(halvedSize, nBlocks, nThreads);
      if (nThreads < 64) {
         ThreadGrid::setThreadsPerBlock(64);
         ThreadGrid::setThreadsLogical(halvedSize, nBlocks, nThreads);
         std::cout << "nThreads too small for parallel reduction.\n"
                   << "Setting nThreads equal to 64." << std::endl;
      }
      if (i == 0) {
         cudaErrorCheck( cudaMalloc(&temp1, nBlocks*sizeof(cudaReal)) );
         _max<<<nBlocks, nThreads, nThreads*sizeof(cudaReal)>>>(temp1, in, n);
         cudaErrorCheck( cudaGetLastError() );
      } else if (i % 2 == 1) {
         if (!temp2) cudaErrorCheck( cudaMalloc(&temp2, nBlocks*sizeof(cudaReal)) );
         _max<<<nBlocks, nThreads, nThreads*sizeof(cudaReal)>>>(temp2, temp1, n);
         cudaErrorCheck( cudaGetLastError() );
      } else {
         _max<<<nBlocks, nThreads, nThreads*sizeof(cudaReal)>>>(temp1, temp2, n);
         cudaErrorCheck( cudaGetLastError() );
      }
      n = nBlocks;
      i++;
   }

   // Copy the newest partial maxima to the host
   cudaReal* temp_h = new cudaReal[n];
   if (i % 2 == 0) {
      cudaMemcpy(temp_h, temp2, n*sizeof(cudaReal), cudaMemcpyDeviceToHost);
   } else if (i % 2 == 1) {
      cudaMemcpy(temp_h, temp1, n*sizeof(cudaReal), cudaMemcpyDeviceToHost);
   }

   // Final linear scan on the host
   cudaReal max = temp_h[0];
   for (int i = 1; i < n; i++) {
      if (temp_h[i] > max) max = temp_h[i];
   }

   delete [] temp_h;
   cudaFree(temp1);
   if (temp2) cudaFree(temp2);
   return max;
}
/*
* Host wrapper: maximum absolute value of a device array. Only the
* first pass applies fabs (via _maxAbs); later passes use plain _max.
* Structure and reconstructed portions are as in sum().
*/
cudaReal maxAbs(const cudaReal* in, int n)
{
   cudaReal *temp1 = 0, *temp2 = 0;   // ping-pong device buffers

   int i = 0;
   while (n >= 1e5 || i == 0) {
      int nBlocks, nThreads;
      int halvedSize = ceil((float)n/2);
      ThreadGrid::setThreadsLogical(halvedSize, nBlocks, nThreads);
      if (nThreads < 64) {
         ThreadGrid::setThreadsPerBlock(64);
         ThreadGrid::setThreadsLogical(halvedSize, nBlocks, nThreads);
         std::cout << "nThreads too small for parallel reduction.\n"
                   << "Setting nThreads equal to 64." << std::endl;
      }
      if (i == 0) {
         cudaErrorCheck( cudaMalloc(&temp1, nBlocks*sizeof(cudaReal)) );
         _maxAbs<<<nBlocks, nThreads, nThreads*sizeof(cudaReal)>>>(temp1, in, n);
         cudaErrorCheck( cudaGetLastError() );
      } else if (i % 2 == 1) {
         if (!temp2) cudaErrorCheck( cudaMalloc(&temp2, nBlocks*sizeof(cudaReal)) );
         _max<<<nBlocks, nThreads, nThreads*sizeof(cudaReal)>>>(temp2, temp1, n);
         cudaErrorCheck( cudaGetLastError() );
      } else {
         _max<<<nBlocks, nThreads, nThreads*sizeof(cudaReal)>>>(temp1, temp2, n);
         cudaErrorCheck( cudaGetLastError() );
      }
      n = nBlocks;
      i++;
   }

   // Copy the newest partial maxima to the host
   cudaReal* temp_h = new cudaReal[n];
   if (i % 2 == 0) {
      cudaMemcpy(temp_h, temp2, n*sizeof(cudaReal), cudaMemcpyDeviceToHost);
   } else if (i % 2 == 1) {
      cudaMemcpy(temp_h, temp1, n*sizeof(cudaReal), cudaMemcpyDeviceToHost);
   }

   // Final linear scan on the host
   cudaReal max = temp_h[0];
   for (int i = 1; i < n; i++) {
      if (temp_h[i] > max) max = temp_h[i];
   }

   delete [] temp_h;
   cudaFree(temp1);
   if (temp2) cudaFree(temp2);
   return max;
}
/*
* Host wrapper: minimum element of a device array. Structure and
* reconstructed portions are as in sum().
*/
cudaReal min(const cudaReal* in, int n)
{
   cudaReal *temp1 = 0, *temp2 = 0;   // ping-pong device buffers

   int i = 0;
   while (n >= 1e5 || i == 0) {
      int nBlocks, nThreads;
      int halvedSize = ceil((float)n/2);
      ThreadGrid::setThreadsLogical(halvedSize, nBlocks, nThreads);
      if (nThreads < 64) {
         ThreadGrid::setThreadsPerBlock(64);
         ThreadGrid::setThreadsLogical(halvedSize, nBlocks, nThreads);
         std::cout << "nThreads too small for parallel reduction.\n"
                   << "Setting nThreads equal to 64." << std::endl;
      }
      if (i == 0) {
         cudaErrorCheck( cudaMalloc(&temp1, nBlocks*sizeof(cudaReal)) );
         _min<<<nBlocks, nThreads, nThreads*sizeof(cudaReal)>>>(temp1, in, n);
         cudaErrorCheck( cudaGetLastError() );
      } else if (i % 2 == 1) {
         if (!temp2) cudaErrorCheck( cudaMalloc(&temp2, nBlocks*sizeof(cudaReal)) );
         _min<<<nBlocks, nThreads, nThreads*sizeof(cudaReal)>>>(temp2, temp1, n);
         cudaErrorCheck( cudaGetLastError() );
      } else {
         _min<<<nBlocks, nThreads, nThreads*sizeof(cudaReal)>>>(temp1, temp2, n);
         cudaErrorCheck( cudaGetLastError() );
      }
      n = nBlocks;
      i++;
   }

   // Copy the newest partial minima to the host
   cudaReal* temp_h = new cudaReal[n];
   if (i % 2 == 0) {
      cudaMemcpy(temp_h, temp2, n*sizeof(cudaReal), cudaMemcpyDeviceToHost);
   } else if (i % 2 == 1) {
      cudaMemcpy(temp_h, temp1, n*sizeof(cudaReal), cudaMemcpyDeviceToHost);
   }

   // Final linear scan on the host
   cudaReal min = temp_h[0];
   for (int i = 1; i < n; i++) {
      if (temp_h[i] < min) min = temp_h[i];
   }

   delete [] temp_h;
   cudaFree(temp1);
   if (temp2) cudaFree(temp2);
   return min;
}
/*
* Host wrapper: minimum absolute value of a device array. Only the
* first pass applies fabs (via _minAbs); later passes use plain _min.
* Structure and reconstructed portions are as in sum().
*/
cudaReal minAbs(const cudaReal* in, int n)
{
   cudaReal *temp1 = 0, *temp2 = 0;   // ping-pong device buffers

   int i = 0;
   while (n >= 1e5 || i == 0) {
      int nBlocks, nThreads;
      int halvedSize = ceil((float)n/2);
      ThreadGrid::setThreadsLogical(halvedSize, nBlocks, nThreads);
      if (nThreads < 64) {
         ThreadGrid::setThreadsPerBlock(64);
         ThreadGrid::setThreadsLogical(halvedSize, nBlocks, nThreads);
         std::cout << "nThreads too small for parallel reduction.\n"
                   << "Setting nThreads equal to 64." << std::endl;
      }
      if (i == 0) {
         cudaErrorCheck( cudaMalloc(&temp1, nBlocks*sizeof(cudaReal)) );
         _minAbs<<<nBlocks, nThreads, nThreads*sizeof(cudaReal)>>>(temp1, in, n);
         cudaErrorCheck( cudaGetLastError() );
      } else if (i % 2 == 1) {
         if (!temp2) cudaErrorCheck( cudaMalloc(&temp2, nBlocks*sizeof(cudaReal)) );
         _min<<<nBlocks, nThreads, nThreads*sizeof(cudaReal)>>>(temp2, temp1, n);
         cudaErrorCheck( cudaGetLastError() );
      } else {
         _min<<<nBlocks, nThreads, nThreads*sizeof(cudaReal)>>>(temp1, temp2, n);
         cudaErrorCheck( cudaGetLastError() );
      }
      n = nBlocks;
      i++;
   }

   // Copy the newest partial minima to the host
   cudaReal* temp_h = new cudaReal[n];
   if (i % 2 == 0) {
      cudaMemcpy(temp_h, temp2, n*sizeof(cudaReal), cudaMemcpyDeviceToHost);
   } else if (i % 2 == 1) {
      cudaMemcpy(temp_h, temp1, n*sizeof(cudaReal), cudaMemcpyDeviceToHost);
   }

   // Final linear scan on the host
   cudaReal min = temp_h[0];
   for (int i = 1; i < n; i++) {
      if (temp_h[i] < min) min = temp_h[i];
   }

   delete [] temp_h;
   cudaFree(temp1);
   if (temp2) cudaFree(temp2);
   return min;
}
/*
* Host wrapper: inner product of two device arrays. The first pass
* forms partial dot products with _innerProduct; later passes reduce
* them with _sum, and the result is finished on the host with Kahan
* summation. Structure and reconstructed portions are as in sum().
*/
cudaReal innerProduct(const cudaReal* a, const cudaReal* b, int n)
{
   cudaReal *temp1 = 0, *temp2 = 0;   // ping-pong device buffers

   int i = 0;
   while (n >= 1e5 || i == 0) {
      int nBlocks, nThreads;
      int halvedSize = ceil((float)n/2);
      ThreadGrid::setThreadsLogical(halvedSize, nBlocks, nThreads);
      if (nThreads < 64) {
         ThreadGrid::setThreadsPerBlock(64);
         ThreadGrid::setThreadsLogical(halvedSize, nBlocks, nThreads);
         std::cout << "nThreads too small for parallel reduction.\n"
                   << "Setting nThreads equal to 64." << std::endl;
      }
      if (i == 0) {
         cudaErrorCheck( cudaMalloc(&temp1, nBlocks*sizeof(cudaReal)) );
         _innerProduct<<<nBlocks, nThreads, nThreads*sizeof(cudaReal)>>>(temp1, a, b, n);
         cudaErrorCheck( cudaGetLastError() );
      } else if (i % 2 == 1) {
         if (!temp2) cudaErrorCheck( cudaMalloc(&temp2, nBlocks*sizeof(cudaReal)) );
         _sum<<<nBlocks, nThreads, nThreads*sizeof(cudaReal)>>>(temp2, temp1, n);
         cudaErrorCheck( cudaGetLastError() );
      } else {
         _sum<<<nBlocks, nThreads, nThreads*sizeof(cudaReal)>>>(temp1, temp2, n);
         cudaErrorCheck( cudaGetLastError() );
      }
      n = nBlocks;
      i++;
   }

   // Copy the newest partial products to the host
   cudaReal* temp_h = new cudaReal[n];
   if (i % 2 == 0) {
      cudaMemcpy(temp_h, temp2, n*sizeof(cudaReal), cudaMemcpyDeviceToHost);
   } else if (i % 2 == 1) {
      cudaMemcpy(temp_h, temp1, n*sizeof(cudaReal), cudaMemcpyDeviceToHost);
   }

   // Kahan summation of the partial dot products
   cudaReal sum = 0.0, tempVal, tempSum;
   cudaReal err = 0.0;
   for (int i = 0; i < n; ++i) {
      tempVal = temp_h[i] - err;
      tempSum = sum + tempVal;
      err = tempSum - sum - tempVal;
      sum = tempSum;
   }

   delete [] temp_h;
   cudaFree(temp1);
   if (temp2) cudaFree(temp2);
   return sum;
}
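/*
* Usage sketch (hypothetical driver, not part of the original file):
* the wrapper signatures above are reconstructions, so this only
* illustrates the intended calling pattern.
*
*    int n = 1 << 20;
*    cudaReal *d_a, *d_b;
*    cudaMalloc(&d_a, n*sizeof(cudaReal));
*    cudaMalloc(&d_b, n*sizeof(cudaReal));
*    // ... fill d_a and d_b on the device ...
*    cudaReal total = sum(d_a, n);
*    cudaReal dot   = innerProduct(d_a, d_b, n);
*    cudaReal peak  = maxAbs(d_a, n);
*/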