/*
* PSCF v1.1 - KernelWrappers.cu
*/
#ifndef PSPG_KERNEL_WRAPPERS_CU
#define PSPG_KERNEL_WRAPPERS_CU

#include "KernelWrappers.h"
#include "ThreadGrid.h"

namespace Pscf {
namespace Pspg {
10__host__ cudaReal gpuSum(cudaReal const * d_in, int size)
11{
12
13 // Establish GPU resources for this parallel reduction. Divided by two because
14 // of the global memory load in the kernel performing the first level of reduction!
15 int nBlocks, nThreads;
16 int halvedsize = ceil((float)size/2);
17
18 ThreadGrid::setThreadsLogical(halvedsize,nBlocks,nThreads);
19
20 // Set up temporary, small device and host arrays for storing reduced data
21 // temp_ will handle excess and reduced and sum them up
22 cudaReal* temp_ = new cudaReal[nBlocks];
23 cudaReal* d_temp_;
24 gpuErrchk(cudaMalloc((void**) &d_temp_, nBlocks*sizeof(cudaReal)));
25
26 // Perform parallel reduction sum operation
27 reductionSum<<<nBlocks, nThreads, nThreads*sizeof(cudaReal)>>>(d_temp_, d_in, size);
28
29 // Load the partially reduced sum result to temp_
30 gpuErrchk(cudaMemcpy(temp_, d_temp_, nBlocks*sizeof(cudaReal), cudaMemcpyDeviceToHost));
31
32 // Perform final sum on CPU using kahan summation to reduce accumulation of error
33 cudaReal sum = 0, tempVal, tempSum;
34 cudaReal err = 0;
35 for (int i = 0; i < nBlocks; ++i) {
36 tempVal = temp_[i] - err;
37 tempSum = sum + tempVal;
38 err = tempSum - sum - tempVal;
39 sum = tempSum;
40 }
41
42 return sum;
43}
44
45__host__ cudaReal gpuInnerProduct(cudaReal const * d_a, cudaReal const * d_b, int size)
46{
47
48 // Establish GPU resources for this parallel reduction. Divided by two because
49 // of the global memory load in the kernel performing the first level of reduction!
50 int nBlocks, nThreads;
51 int halvedsize = ceil((float)size/2);
52
53 ThreadGrid::setThreadsLogical(halvedsize,nBlocks,nThreads);
54
55 // Set up temporary, small device and host arrays for storing reduced data
56 // temp_ will handle excess and reduced and sum them up
57 cudaReal* temp_ = new cudaReal[nBlocks];
58 cudaReal* d_temp_;
59 gpuErrchk(cudaMalloc((void**) &d_temp_, nBlocks*sizeof(cudaReal)));
60
61 // Perform parallel reduction inner product operation
62 reductionInnerProduct<<<nBlocks, nThreads, nThreads*sizeof(cudaReal)>>>(d_temp_, d_a, d_b, size);
63
64 // Load the partially reduced inner product result to temp_
65 gpuErrchk(cudaMemcpy(temp_, d_temp_, nBlocks*sizeof(cudaReal), cudaMemcpyDeviceToHost));
66
67 // Perform final sum on CPU using kahan summation to reduce accumulation of error
68 cudaReal sum = 0, tempVal, tempSum;
69 cudaReal err = 0;
70 for (int i = 0; i < nBlocks; ++i) {
71 tempVal = temp_[i] - err;
72 tempSum = sum + tempVal;
73 err = tempSum - sum - tempVal;
74 sum = tempSum;
75 }
76
77 return sum;
78}
79
80__host__ cudaReal gpuMax(cudaReal const * d_in, int size)
81{
82 // Establish GPU resources for this parallel reduction. Divided by two because
83 // of the global memory load in the kernel performing the first level of reduction!
84 int nBlocks, nThreads;
85 int halvedsize = ceil((float)size/2);
86
87 ThreadGrid::setThreadsLogical(halvedsize,nBlocks,nThreads);
88
89 // Set up temporary, small device and host arrays for storing reduced data
90 // temp_ will handle excess and reduced and sum them up
91 cudaReal* temp_ = new cudaReal[nBlocks];
92 cudaReal* d_temp_;
93 gpuErrchk(cudaMalloc((void**) &d_temp_, nBlocks*sizeof(cudaReal)));
94
95 // Perform parallel reduction maximum operation
96 reductionMax<<<nBlocks, nThreads, nThreads*sizeof(cudaReal)>>>(d_temp_, d_in, size);
97
98 // Load the partially reduced maximum result to temp_
99 gpuErrchk(cudaMemcpy(temp_, d_temp_, nBlocks*sizeof(cudaReal), cudaMemcpyDeviceToHost));
100
101 // Perform final comparison on CPU
102 cudaReal max = 0;
103 for (int i = 0; i < nBlocks; i++) {
104 if (temp_[i] > max) max = temp_[i];
105 }
106
107 return max;
108}
109
110__host__ cudaReal gpuMaxAbs(cudaReal const * d_in, int size)
111{
112 // Establish GPU resources for this parallel reduction. Divided by two because
113 // of the global memory load in the kernel performing the first level of reduction!
114 int nBlocks, nThreads;
115 int halvedsize = ceil((float)size/2);
116
117 ThreadGrid::setThreadsLogical(halvedsize,nBlocks,nThreads);
118
119 // Set up temporary, small device and host arrays for storing reduced data
120 // temp_ will handle excess and reduced and sum them up
121 cudaReal* temp_ = new cudaReal[nBlocks];
122 cudaReal* d_temp_;
123 gpuErrchk(cudaMalloc((void**) &d_temp_, nBlocks*sizeof(cudaReal)));
124
125 // Perform parallel reduction maximum of the absolute value
126 reductionMaxAbs<<<nBlocks, nThreads, nThreads*sizeof(cudaReal)>>>(d_temp_, d_in, size);
127
128 // Load the partially reduced maximum of the absolute value result to temp_
129 gpuErrchk(cudaMemcpy(temp_, d_temp_, nBlocks*sizeof(cudaReal), cudaMemcpyDeviceToHost));
130
131 // Perform final comparison on CPU. Absolute values already taken in kernel, so
132 // no need to take them here.
133 cudaReal max = 0;
134 for (int i = 0; i < nBlocks; i++) {
135 if (temp_[i] > max) max = temp_[i];
136 }
137
138 return max;
139}
140
141__host__ cudaReal gpuMin(cudaReal const * d_in, int size)
142{
143 // Establish GPU resources for this parallel reduction. Divided by two because
144 // of the global memory load in the kernel performing the first level of reduction!
145 int nBlocks, nThreads;
146 int halvedsize = ceil((float)size/2);
147
148 ThreadGrid::setThreadsLogical(halvedsize,nBlocks,nThreads);
149
150 // Set up temporary, small device and host arrays for storing reduced data
151 // temp_ will handle excess and reduced and sum them up
152 cudaReal* temp_ = new cudaReal[nBlocks];
153 cudaReal* d_temp_;
154 gpuErrchk(cudaMalloc((void**) &d_temp_, nBlocks*sizeof(cudaReal)));
155
156 // Perform parallel reduction minimum operation
157 reductionMin<<<nBlocks, nThreads, nThreads*sizeof(cudaReal)>>>(d_temp_, d_in, size);
158
159 // Load the partially reduced minimum result to temp_
160 gpuErrchk(cudaMemcpy(temp_, d_temp_, nBlocks*sizeof(cudaReal), cudaMemcpyDeviceToHost));
161
162 // Perform final comparison on CPU
163 cudaReal min = temp_[0];
164 for (int i = 1; i < nBlocks; i++) {
165 if (temp_[i] < min) min = temp_[i];
166 }
167
168 return min;
169}
170
171__host__ cudaReal gpuMinAbs(cudaReal const * d_in, int size)
172{
173 // Establish GPU resources for this parallel reduction. Divided by two because
174 // of the global memory load in the kernel performing the first level of reduction!
175 int nBlocks, nThreads;
176 int halvedsize = ceil((float)size/2);
177
178 ThreadGrid::setThreadsLogical(halvedsize,nBlocks,nThreads);
179
180 // Set up temporary, small device and host arrays for storing reduced data
181 // temp_ will handle excess and reduced and sum them up
182 cudaReal* temp_ = new cudaReal[nBlocks];
183 cudaReal* d_temp_;
184 gpuErrchk(cudaMalloc((void**) &d_temp_, nBlocks*sizeof(cudaReal)));
185
186 // Perform parallel reduction minimum of the absolute value
187 reductionMinAbs<<<nBlocks, nThreads, nThreads*sizeof(cudaReal)>>>(d_temp_, d_in, size);
188
189 // Load the partially reduced minimum of the absolute value result to temp_
190 gpuErrchk(cudaMemcpy(temp_, d_temp_, nBlocks*sizeof(cudaReal), cudaMemcpyDeviceToHost));
191
192 // Perform final comparison on CPU
193 cudaReal min = temp_[0];
194 for (int i = 1; i < nBlocks; i++) {
195 if (temp_[i] < min) min = temp_[i];
196 }
197
198 return min;
199}
200
201}
202}
203#endif
int nThreads()
Get the number of threads per block for execution.
Definition: ThreadGrid.cu:173
int nBlocks()
Get the current number of blocks for execution.
Definition: ThreadGrid.cu:170
void setThreadsLogical(int nThreadsLogical)
Set the total number of threads required for execution.
Definition: ThreadGrid.cu:80
C++ namespace for polymer self-consistent field theory (PSCF).