#include <pscf/cuda/ThreadArray.h>
#include <pscf/cuda/HostDArray.h>
#include <pscf/cuda/cudaErrorCheck.h>
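
// Device kernels and host-side drivers that perform parallel reductions
// (sum, max, maxAbs, min, minAbs, and inner product) on the GPU. Each kernel
// reduces data within a thread block using shared memory; the host drivers
// below launch repeated passes and finish the reduction on the CPU.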
__device__
void _warpSum(volatile cudaReal* sData, int tId)
{
   sData[tId] += sData[tId + 32];
   sData[tId] += sData[tId + 16];
   sData[tId] += sData[tId + 8];
   sData[tId] += sData[tId + 4];
   sData[tId] += sData[tId + 2];
   sData[tId] += sData[tId + 1];
}
__device__
void _warpMax(volatile cudaReal* sData, int tId)
{
   if (sData[tId + 32] > sData[tId]) sData[tId] = sData[tId + 32];
   if (sData[tId + 16] > sData[tId]) sData[tId] = sData[tId + 16];
   if (sData[tId + 8] > sData[tId]) sData[tId] = sData[tId + 8];
   if (sData[tId + 4] > sData[tId]) sData[tId] = sData[tId + 4];
   if (sData[tId + 2] > sData[tId]) sData[tId] = sData[tId + 2];
   if (sData[tId + 1] > sData[tId]) sData[tId] = sData[tId + 1];
}
__device__
void _warpMin(volatile cudaReal* sData, int tId)
{
   if (sData[tId + 32] < sData[tId]) sData[tId] = sData[tId + 32];
   if (sData[tId + 16] < sData[tId]) sData[tId] = sData[tId + 16];
   if (sData[tId + 8] < sData[tId]) sData[tId] = sData[tId + 8];
   if (sData[tId + 4] < sData[tId]) sData[tId] = sData[tId + 4];
   if (sData[tId + 2] < sData[tId]) sData[tId] = sData[tId + 2];
   if (sData[tId + 1] < sData[tId]) sData[tId] = sData[tId + 1];
}
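
// Note: _warpSum, _warpMax, and _warpMin handle the final 64 -> 1 step of a
// block reduction. Only threads 0-31 (a single warp) call them, so the usual
// __syncthreads() barriers are omitted, and the shared-memory pointer is
// declared volatile to force each step to read and write shared memory.
// (On architectures with independent thread scheduling, warp-synchronous
// code like this generally needs explicit __syncwarp() calls; the form shown
// here is the classic pattern.)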
__global__
void _sum(cudaReal* sum, const cudaReal* in, int n)
{
   int tId = threadIdx.x;
   int bId = blockIdx.x;
   int bDim = blockDim.x;
   int idx = bId * (bDim*2) + tId;

   // Shared memory workspace, one element per thread
   extern __shared__ cudaReal sData[];

   // Each thread loads and adds two input elements
   if (idx < n) {
      sData[tId] = in[idx];
      if (idx + bDim < n) {
         sData[tId] += in[idx+bDim];
      }
   } else {
      sData[tId] = 0.0;
   }
   __syncthreads();

   // Tree reduction in shared memory down to a single warp
   for (int stride = bDim / 2; stride > 32; stride /= 2) {
      if (tId < stride) {
         sData[tId] += sData[tId+stride];
      }
      __syncthreads();
   }

   // Final warp-level reduction; thread 0 writes the block's result
   if (tId < 32) _warpSum(sData, tId);
   if (tId == 0) sum[bId] = sData[0];
}
__global__
void _max(cudaReal* max, const cudaReal* in, int n)
{
   int tId = threadIdx.x;
   int bId = blockIdx.x;
   int bDim = blockDim.x;
   int idx = bId * (bDim*2) + tId;

   // Shared memory workspace, one element per thread
   extern __shared__ cudaReal sData[];

   // Each thread loads two input elements and keeps the larger
   if (idx < n) {
      sData[tId] = in[idx];
      if (idx + bDim < n) {
         cudaReal in1 = in[idx+bDim];
         sData[tId] = (sData[tId] > in1) ? sData[tId] : in1;
      }
   } else {
      sData[tId] = in[idx-n];   // duplicates do not change the max
   }
   __syncthreads();

   // Tree reduction in shared memory down to a single warp
   for (int stride = bDim/2; stride > 32; stride/=2) {
      if (tId < stride) {
         if (sData[tId+stride] > sData[tId]) {
            sData[tId] = sData[tId+stride];
         }
      }
      __syncthreads();
   }

   // Final warp-level reduction; thread 0 writes the block's result
   if (tId < 32) _warpMax(sData, tId);
   if (tId == 0) max[bId] = sData[0];
}
__global__
void _maxAbs(cudaReal* max, const cudaReal* in, int n)
{
   int tId = threadIdx.x;
   int bId = blockIdx.x;
   int bDim = blockDim.x;
   int idx = bId * (bDim*2) + tId;

   // Shared memory workspace, one element per thread
   extern __shared__ cudaReal sData[];

   // Each thread loads two input elements and keeps the larger magnitude
   if (idx < n) {
      sData[tId] = fabs(in[idx]);
      if (idx + bDim < n) {
         cudaReal in1 = fabs(in[idx+bDim]);
         sData[tId] = (sData[tId] > in1) ? sData[tId] : in1;
      }
   } else {
      sData[tId] = fabs(in[idx-n]);
   }
   __syncthreads();

   // Tree reduction in shared memory down to a single warp
   for (int stride = bDim/2; stride > 32; stride/=2) {
      if (tId < stride) {
         if (sData[tId+stride] > sData[tId]) {
            sData[tId] = sData[tId+stride];
         }
      }
      __syncthreads();
   }

   // Final warp-level reduction; thread 0 writes the block's result
   if (tId < 32) _warpMax(sData, tId);
   if (tId == 0) max[bId] = sData[0];
}
__global__
void _min(cudaReal* min, const cudaReal* in, int n)
{
   int tId = threadIdx.x;
   int bId = blockIdx.x;
   int bDim = blockDim.x;
   int idx = bId * (bDim*2) + tId;

   // Shared memory workspace, one element per thread
   extern __shared__ cudaReal sData[];

   // Each thread loads two input elements and keeps the smaller
   if (idx < n) {
      sData[tId] = in[idx];
      if (idx + bDim < n) {
         cudaReal in1 = in[idx+bDim];
         sData[tId] = (sData[tId] < in1) ? sData[tId] : in1;
      }
   } else {
      sData[tId] = in[idx-n];   // duplicates do not change the min
   }
   __syncthreads();

   // Tree reduction in shared memory down to a single warp
   for (int stride = bDim/2; stride > 32; stride/=2) {
      if (tId < stride) {
         if (sData[tId+stride] < sData[tId]) {
            sData[tId] = sData[tId+stride];
         }
      }
      __syncthreads();
   }

   // Final warp-level reduction; thread 0 writes the block's result
   if (tId < 32) _warpMin(sData, tId);
   if (tId == 0) min[bId] = sData[0];
}
__global__
void _minAbs(cudaReal* min, const cudaReal* in, int n)
{
   int tId = threadIdx.x;
   int bId = blockIdx.x;
   int bDim = blockDim.x;
   int idx = bId * (bDim*2) + tId;

   // Shared memory workspace, one element per thread
   extern __shared__ cudaReal sData[];

   // Each thread loads two input elements and keeps the smaller magnitude
   if (idx < n) {
      sData[tId] = fabs(in[idx]);
      if (idx + bDim < n) {
         cudaReal in1 = fabs(in[idx+bDim]);
         sData[tId] = (sData[tId] < in1) ? sData[tId] : in1;
      }
   } else {
      sData[tId] = fabs(in[idx-n]);
   }
   __syncthreads();

   // Tree reduction in shared memory down to a single warp
   for (int stride = bDim/2; stride > 32; stride/=2) {
      if (tId < stride) {
         if (sData[tId+stride] < sData[tId]) {
            sData[tId] = sData[tId+stride];
         }
      }
      __syncthreads();
   }

   // Final warp-level reduction; thread 0 writes the block's result
   if (tId < 32) _warpMin(sData, tId);
   if (tId == 0) min[bId] = sData[0];
}
__global__
void _innerProduct(cudaReal* ip, const cudaReal* a, const cudaReal* b, int n)
{
   int tId = threadIdx.x;
   int bId = blockIdx.x;
   int bDim = blockDim.x;
   int idx = bId * (bDim*2) + tId;

   // Shared memory workspace, one element per thread
   extern __shared__ cudaReal sData[];

   // Each thread computes and adds two elementwise products
   if (idx < n) {
      sData[tId] = a[idx] * b[idx];
      if (idx + bDim < n) {
         sData[tId] += (a[idx+bDim] * b[idx+bDim]);
      }
   } else {
      sData[tId] = 0.0;
   }
   __syncthreads();

   // Tree reduction in shared memory down to a single warp
   for (int stride = bDim / 2; stride > 32; stride /= 2) {
      if (tId < stride) {
         sData[tId] += sData[tId+stride];
      }
      __syncthreads();
   }

   // Final warp-level reduction; thread 0 writes the block's result
   if (tId < 32) _warpSum(sData, tId);
   if (tId == 0) ip[bId] = sData[0];
}
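
/*
* Launch-configuration note for the kernels above (illustrative; the names
* d_partial and d_in are hypothetical): each block reduces 2*nThreads input
* elements and writes one partial result, and each launch needs
* nThreads*sizeof(cudaReal) bytes of dynamic shared memory. A single pass
* therefore looks like
*
*    int nBlocks = (n + 2*nThreads - 1) / (2*nThreads);
*    _sum<<<nBlocks, nThreads, nThreads*sizeof(cudaReal)>>>(d_partial, d_in, n);
*    cudaErrorCheck( cudaGetLastError() );
*
* with nThreads >= 64 so that the warp-level helpers can safely read
* sData[tId + 32]. The host drivers below apply such passes repeatedly.
*/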
// Host driver for sum: reduce on the GPU in passes, then finish on the host
int i = 0;                         // pass counter
while (n >= 1e5 || i == 0) {       // keep reducing until few elements remain

   // Establish the GPU execution configuration for this pass
   int nBlocks, nThreads;
   int halvedSize = ceil((float)n/2);
   // ... (set nBlocks and nThreads for halvedSize logical threads via ThreadArray) ...
   if (nThreads < 64) {
      // The warp-level step needs at least 64 threads per block
      Log::file() << "nThreads too small for parallel reduction.\n"
                  << "Setting nThreads equal to 64." << std::endl;
      // ... (reset the configuration with 64 threads per block) ...
   }

   // Launch one reduction pass, leaving one partial sum per block.
   // Successive passes alternate between two temporary device arrays.
   if (i == 0) {
      _sum<<<nBlocks, nThreads, nThreads*sizeof(cudaReal)>>>
                                 (/* output, input array, n */);
      cudaErrorCheck( cudaGetLastError() );
   } else if (i % 2 == 1) {
      _sum<<<nBlocks, nThreads, nThreads*sizeof(cudaReal)>>>
                                 (/* output, input array, n */);
      cudaErrorCheck( cudaGetLastError() );
   } else {
      _sum<<<nBlocks, nThreads, nThreads*sizeof(cudaReal)>>>
                                 (/* output, input array, n */);
      cudaErrorCheck( cudaGetLastError() );
   }

   n = nBlocks;   // number of partial sums remaining
   i += 1;
}

// ... (copy the remaining partial sums to the host array temp_h; which
//      temporary device array holds them depends on the parity of i) ...

// Finish the sum on the host using Kahan (compensated) summation
cudaReal sum = 0.0, tempVal, tempSum;
cudaReal err = 0.0;
for (int i = 0; i < n; ++i) {
   tempVal = temp_h[i] - err;
   tempSum = sum + tempVal;
   err = tempSum - sum - tempVal;
   sum = tempSum;
}
return sum;
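
// Note: the summation loop above is Kahan (compensated) summation. On each
// iteration, err captures the low-order bits that were lost when tempVal was
// added to the running sum, and subtracting err from the next element feeds
// that lost part back in, so roundoff error stays roughly constant instead of
// growing with n. The inner-product driver below finishes the same way.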
// Host driver for max: reduce on the GPU in passes, then finish on the host
int i = 0;                         // pass counter
while (n >= 1e5 || i == 0) {       // keep reducing until few elements remain

   // Establish the GPU execution configuration for this pass
   int nBlocks, nThreads;
   int halvedSize = ceil((float)n/2);
   // ... (set nBlocks and nThreads for halvedSize logical threads via ThreadArray) ...
   if (nThreads < 64) {
      Log::file() << "nThreads too small for parallel reduction.\n"
                  << "Setting nThreads equal to 64." << std::endl;
      // ... (reset the configuration with 64 threads per block) ...
   }

   // Launch one reduction pass, leaving one partial maximum per block
   if (i == 0) {
      _max<<<nBlocks, nThreads, nThreads*sizeof(cudaReal)>>>
                                 (/* output, input array, n */);
      cudaErrorCheck( cudaGetLastError() );
   } else if (i % 2 == 1) {
      _max<<<nBlocks, nThreads, nThreads*sizeof(cudaReal)>>>
                                 (/* output, input array, n */);
      cudaErrorCheck( cudaGetLastError() );
   } else {
      _max<<<nBlocks, nThreads, nThreads*sizeof(cudaReal)>>>
                                 (/* output, input array, n */);
      cudaErrorCheck( cudaGetLastError() );
   }

   n = nBlocks;
   i += 1;
}

// ... (copy the remaining partial maxima to the host array temp_h) ...

// Finish the maximum on the host
cudaReal max = temp_h[0];
for (int i = 1; i < n; i++) {
   if (temp_h[i] > max) max = temp_h[i];
}
return max;
// Host driver for maxAbs: reduce on the GPU in passes, then finish on the host
int i = 0;                         // pass counter
while (n >= 1e5 || i == 0) {       // keep reducing until few elements remain

   // Establish the GPU execution configuration for this pass
   int nBlocks, nThreads;
   int halvedSize = ceil((float)n/2);
   // ... (set nBlocks and nThreads for halvedSize logical threads via ThreadArray) ...
   if (nThreads < 64) {
      Log::file() << "nThreads too small for parallel reduction.\n"
                  << "Setting nThreads equal to 64." << std::endl;
      // ... (reset the configuration with 64 threads per block) ...
   }

   // Launch one reduction pass: _maxAbs on the first pass, _max afterwards
   if (i == 0) {
      _maxAbs<<<nBlocks, nThreads, nThreads*sizeof(cudaReal)>>>
                                 (/* output, input array, n */);
      cudaErrorCheck( cudaGetLastError() );
   } else if (i % 2 == 1) {
      _max<<<nBlocks, nThreads, nThreads*sizeof(cudaReal)>>>
                                 (/* output, input array, n */);
      cudaErrorCheck( cudaGetLastError() );
   } else {
      _max<<<nBlocks, nThreads, nThreads*sizeof(cudaReal)>>>
                                 (/* output, input array, n */);
      cudaErrorCheck( cudaGetLastError() );
   }

   n = nBlocks;
   i += 1;
}

// ... (copy the remaining partial maxima to the host array temp_h) ...

// Finish the maximum on the host
cudaReal max = temp_h[0];
for (int i = 1; i < n; i++) {
   if (temp_h[i] > max) max = temp_h[i];
}
return max;
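
// Note: only the first GPU pass in the driver above uses _maxAbs. After that
// pass every partial result is already an absolute value, so later passes and
// the final host loop can take a plain maximum. The minAbs driver below uses
// _minAbs and _min in the same way.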
// Host driver for min: reduce on the GPU in passes, then finish on the host
int i = 0;                         // pass counter
while (n >= 1e5 || i == 0) {       // keep reducing until few elements remain

   // Establish the GPU execution configuration for this pass
   int nBlocks, nThreads;
   int halvedSize = ceil((float)n/2);
   // ... (set nBlocks and nThreads for halvedSize logical threads via ThreadArray) ...
   if (nThreads < 64) {
      Log::file() << "nThreads too small for parallel reduction.\n"
                  << "Setting nThreads equal to 64." << std::endl;
      // ... (reset the configuration with 64 threads per block) ...
   }

   // Launch one reduction pass, leaving one partial minimum per block
   if (i == 0) {
      _min<<<nBlocks, nThreads, nThreads*sizeof(cudaReal)>>>
                                 (/* output, input array, n */);
      cudaErrorCheck( cudaGetLastError() );
   } else if (i % 2 == 1) {
      _min<<<nBlocks, nThreads, nThreads*sizeof(cudaReal)>>>
                                 (/* output, input array, n */);
      cudaErrorCheck( cudaGetLastError() );
   } else {
      _min<<<nBlocks, nThreads, nThreads*sizeof(cudaReal)>>>
                                 (/* output, input array, n */);
      cudaErrorCheck( cudaGetLastError() );
   }

   n = nBlocks;
   i += 1;
}

// ... (copy the remaining partial minima to the host array temp_h) ...

// Finish the minimum on the host
cudaReal min = temp_h[0];
for (int i = 1; i < n; i++) {
   if (temp_h[i] < min) min = temp_h[i];
}
return min;
// Host driver for minAbs: reduce on the GPU in passes, then finish on the host
int i = 0;                         // pass counter
while (n >= 1e5 || i == 0) {       // keep reducing until few elements remain

   // Establish the GPU execution configuration for this pass
   int nBlocks, nThreads;
   int halvedSize = ceil((float)n/2);
   // ... (set nBlocks and nThreads for halvedSize logical threads via ThreadArray) ...
   if (nThreads < 64) {
      Log::file() << "nThreads too small for parallel reduction.\n"
                  << "Setting nThreads equal to 64." << std::endl;
      // ... (reset the configuration with 64 threads per block) ...
   }

   // Launch one reduction pass: _minAbs on the first pass, _min afterwards
   if (i == 0) {
      _minAbs<<<nBlocks, nThreads, nThreads*sizeof(cudaReal)>>>
                                 (/* output, input array, n */);
      cudaErrorCheck( cudaGetLastError() );
   } else if (i % 2 == 1) {
      _min<<<nBlocks, nThreads, nThreads*sizeof(cudaReal)>>>
                                 (/* output, input array, n */);
      cudaErrorCheck( cudaGetLastError() );
   } else {
      _min<<<nBlocks, nThreads, nThreads*sizeof(cudaReal)>>>
                                 (/* output, input array, n */);
      cudaErrorCheck( cudaGetLastError() );
   }

   n = nBlocks;
   i += 1;
}

// ... (copy the remaining partial minima to the host array temp_h) ...

// Finish the minimum on the host
cudaReal min = temp_h[0];
for (int i = 1; i < n; i++) {
   if (temp_h[i] < min) min = temp_h[i];
}
return min;
// Host driver for innerProduct: form elementwise products and reduce them on
// the GPU in passes, then finish on the host
int i = 0;                         // pass counter
while (n >= 1e5 || i == 0) {       // keep reducing until few elements remain

   // Establish the GPU execution configuration for this pass
   int nBlocks, nThreads;
   int halvedSize = ceil((float)n/2);
   // ... (set nBlocks and nThreads for halvedSize logical threads via ThreadArray) ...
   if (nThreads < 64) {
      Log::file() << "nThreads too small for parallel reduction.\n"
                  << "Setting nThreads equal to 64." << std::endl;
      // ... (reset the configuration with 64 threads per block) ...
   }

   // Launch one reduction pass: _innerProduct on the first pass, then _sum on
   // the partial sums produced by earlier passes
   if (i == 0) {
      _innerProduct<<<nBlocks, nThreads, nThreads*sizeof(cudaReal)>>>
                                 (/* output, input arrays a and b, n */);
      cudaErrorCheck( cudaGetLastError() );
   } else if (i % 2 == 1) {
      _sum<<<nBlocks, nThreads, nThreads*sizeof(cudaReal)>>>
                                 (/* output, input array, n */);
      cudaErrorCheck( cudaGetLastError() );
   } else {
      _sum<<<nBlocks, nThreads, nThreads*sizeof(cudaReal)>>>
                                 (/* output, input array, n */);
      cudaErrorCheck( cudaGetLastError() );
   }

   n = nBlocks;
   i += 1;
}

// ... (copy the remaining partial sums to the host array temp_h) ...

// Finish the sum on the host using Kahan (compensated) summation
cudaReal sum = 0.0, tempVal, tempSum;
cudaReal err = 0.0;
for (int i = 0; i < n; ++i) {
   tempVal = temp_h[i] - err;
   tempSum = sum + tempVal;
   err = tempSum - sum - tempVal;
   sum = tempSum;
}
return sum;
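
/*
* Illustrative sketch (not part of the library): a minimal, self-contained
* driver that applies the same multi-pass reduction pattern as the host
* functions above to compute a sum. It is simplified relative to the real
* drivers: it uses raw CUDA allocation and cudaMemcpy instead of DeviceArray,
* HostDArray, and the ThreadArray utilities, fixes the block size at 64
* threads, and skips the Kahan-summed host finish because it reduces all the
* way to a single value on the GPU. The names exampleSum and d_work are
* hypothetical.
*/
static cudaReal exampleSum(const cudaReal* d_in, int n)
{
   const int nThreads = 64;   // matches the minimum block size enforced above
   int nBlocks = (n + 2*nThreads - 1) / (2*nThreads);

   // Two workspaces for per-block partial sums, used alternately (ping-pong),
   // because a pass cannot safely read and write the same array.
   cudaReal* d_work[2];
   cudaErrorCheck( cudaMalloc(&d_work[0], nBlocks*sizeof(cudaReal)) );
   cudaErrorCheck( cudaMalloc(&d_work[1], nBlocks*sizeof(cudaReal)) );

   // First pass reduces the input array to one partial sum per block
   _sum<<<nBlocks, nThreads, nThreads*sizeof(cudaReal)>>>(d_work[0], d_in, n);
   cudaErrorCheck( cudaGetLastError() );

   // Later passes reduce the previous pass's partial sums
   int m = nBlocks;
   int src = 0;
   while (m > 1) {
      nBlocks = (m + 2*nThreads - 1) / (2*nThreads);
      _sum<<<nBlocks, nThreads, nThreads*sizeof(cudaReal)>>>
                                       (d_work[1-src], d_work[src], m);
      cudaErrorCheck( cudaGetLastError() );
      m = nBlocks;
      src = 1 - src;
   }

   // Copy the single remaining value back to the host
   cudaReal result;
   cudaErrorCheck( cudaMemcpy(&result, d_work[src], sizeof(cudaReal),
                              cudaMemcpyDeviceToHost) );
   cudaErrorCheck( cudaFree(d_work[0]) );
   cudaErrorCheck( cudaFree(d_work[1]) );
   return result;
}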