1#ifndef PSPG_PARALLEL_REDUCTIONS_CU
2#define PSPG_PARALLEL_REDUCTIONS_CU
4#include "ParallelReductions.h"
9__global__
void reductionSum(cudaReal* sum,
const cudaReal* in,
int size)
12 int tid = threadIdx.x;
14 int bdim = blockDim.x;
15 int idx = bid * (blockDim.x*2) + tid;
19 extern __shared__ cudaReal sdata[];
22 cudaReal in0 = in[idx];
23 if (idx + bdim < size) {
24 cudaReal in1 = in[idx+bdim];
36 for (
int stride = blockDim.x/2; stride > 0; stride/=2) {
37 if (tid < stride && idx+stride < size) {
38 sdata[tid] += sdata[tid+stride];
49__global__
void reductionInnerProduct(cudaReal* innerprod,
const cudaReal* a,
const cudaReal* b,
int size)
52 int tid = threadIdx.x;
54 int bdim = blockDim.x;
55 int idx = bid * bdim*2 + tid;
59 extern __shared__ cudaReal sdata[];
62 cudaReal in0 = a[idx]*b[idx];
63 if (idx + bdim < size) {
64 cudaReal in1 = a[idx+bdim]*b[idx+bdim];
65 sdata[tid] = in0 + in1;
77 for (
int stride = blockDim.x/2; stride > 0; stride/=2) {
78 if (tid < stride && idx+stride < size) {
79 sdata[tid] += sdata[tid+stride];
86 innerprod[bid] = sdata[0];
91__global__
void reductionMax(cudaReal* max,
const cudaReal* in,
int size)
94 int tid = threadIdx.x;
96 int bdim = blockDim.x;
97 int idx = bid * (blockDim.x*2) + tid;
101 extern __shared__ cudaReal sdata[];
104 cudaReal in0 = in[idx];
105 if (idx + bdim < size) {
106 cudaReal in1 = in[idx+bdim];
107 sdata[tid] = (in0 > in1) ? in0 : in1;
118 for (
int stride = blockDim.x/2; stride > 0; stride/=2) {
119 if (tid < stride && idx+stride < size) {
120 if (sdata[tid+stride] > sdata[tid]) {
121 sdata[tid] = sdata[tid+stride];
133__global__
void reductionMaxAbs(cudaReal* max,
const cudaReal* in,
int size)
136 int tid = threadIdx.x;
137 int bid = blockIdx.x;
138 int bdim = blockDim.x;
139 int idx = bid * (blockDim.x*2) + tid;
143 extern __shared__ cudaReal sdata[];
146 cudaReal in0 = fabs(in[idx]);
147 if (idx + bdim < size) {
148 cudaReal in1 = fabs(in[idx+bdim]);
149 sdata[tid] = (in0 > in1) ? in0 : in1;
159 for (
int stride = blockDim.x / 2; stride > 0; stride /= 2) {
160 if (tid < stride && idx+stride < size) {
161 if (sdata[tid+stride] > sdata[tid]) {
162 sdata[tid] = sdata[tid+stride];
174__global__
void reductionMin(cudaReal* min,
const cudaReal* in,
int size)
177 int tid = threadIdx.x;
178 int bid = blockIdx.x;
179 int bdim = blockDim.x;
180 int idx = bid * (blockDim.x*2) + tid;
184 extern __shared__ cudaReal sdata[];
187 cudaReal in0 = in[idx];
188 if (idx + bdim < size) {
189 cudaReal in1 = in[idx+bdim];
190 sdata[tid] = (in0 < in1) ? in0 : in1;
201 for (
int stride = blockDim.x/2; stride > 0; stride/=2) {
202 if (tid < stride && idx+stride < size) {
203 if (sdata[tid+stride] < sdata[tid]) {
204 sdata[tid] = sdata[tid+stride];
217__global__
void reductionMinAbs(cudaReal* min,
const cudaReal* in,
int size)
220 int tid = threadIdx.x;
221 int bid = blockIdx.x;
222 int bdim = blockDim.x;
223 int idx = bid * (blockDim.x*2) + tid;
227 extern __shared__ cudaReal sdata[];
230 cudaReal in0 = fabs(in[idx]);
231 if (idx + bdim < size) {
232 cudaReal in1 = fabs(in[idx+bdim]);
233 sdata[tid] = (in0 < in1) ? in0 : in1;
244 for (
int stride = blockDim.x/2; stride > 0; stride/=2) {
245 if (tid < stride && idx+stride < size) {
246 if (sdata[tid+stride] < sdata[tid]) {
247 sdata[tid] = sdata[tid+stride];
C++ namespace for polymer self-consistent field theory (PSCF).