PSCF v1.2
ThreadMesh.cu
1/*
2* PSCF Package
3*
4* Copyright 2016 - 2022, The Regents of the University of Minnesota
5* Distributed under the terms of the GNU General Public License.
6*/
7
8#include "ThreadMesh.h"
9#include <cuda_runtime.h>
10
11namespace Pscf {
12namespace ThreadMesh {
13
14 using namespace Util;
15
16 namespace {
17
18 /*
19 * Anonymous namespace containing "static" variables and functions only
20 * used by global functions defined in namespace ThreadMesh. These are
21 * thus persistent pseudo-private variables and functions, much like
22 * private static class variables.
23 */
24
25 // Max threads per streaming multiprocessor, set by querying hardware.
26 int MAX_THREADS_PER_SM = -1;
27
28 // Max threads per block, set by querying hardware and optimizing.
29 int MAX_THREADS_PER_BLOCK = -1;
30
31 // Threads per block in current configuration.
32 // This may be either user-defined or equal to MAX_THREADS_PER_BLOCK.
33 int THREADS_PER_BLOCK = -1;
34
35 // Number of threads per warp.
36 int WARP_SIZE = -1;
37
38 // Multidimensional array of threads requested for execution.
39 dim3 MESH_DIMS(0, 0, 0);
40
41 // Dimensions of multidimensional thread block for execution.
42 dim3 BLOCK_DIMS(0, 0, 0);
43
44 // Dimensions of multidimensional grid of blocks for execution.
45 dim3 GRID_DIMS(0, 0, 0);
46
47 // Will threads go unused?
48 bool UNUSED_THREADS;
49
50 /*
51 * Initialize the ThreadMesh namespace
52 */
53 void init()
54 {
55 // Check that a CUDA device is available.
56 int count = 0;
57 cudaGetDeviceCount(&count);
58
59 if (count == 0) {
60 UTIL_THROW("No CUDA devices found.");
61 } else if (count > 1) {
62 Log::file() << "\nWarning: multiple GPUs detected.\n"
63 << "This program is not compatible with multiple devices.\n"
64 << "Only the first device will be used." << std::endl;
65 }
66
67 // Get properties, assuming one GPU.
68 cudaDeviceProp dprop;
69 cudaGetDeviceProperties(&dprop, 0);
70 WARP_SIZE = dprop.warpSize;
71 MAX_THREADS_PER_SM = dprop.maxThreadsPerMultiProcessor;
72
73 // Find the highest power of two that evenly divides into the
74 // maximum number of threads per streaming multiprocessor
75 // This will lead to the highest occupancy!
76 MAX_THREADS_PER_BLOCK = (MAX_THREADS_PER_SM &
77 (~(MAX_THREADS_PER_SM - 1)));
78
79 // If this value is > maxThreadsPerBlock, reduce
80 while (MAX_THREADS_PER_BLOCK > dprop.maxThreadsPerBlock) {
81 MAX_THREADS_PER_BLOCK /= 2;
82 }
83 }
84
85 }
86
87 template <int D>
88 void setConfig(IntVec<D> const & meshDims, bool invert, int blockSize)
89 {
90 // Verify that requested thread grid is valid
91 for (int i = 0; i < D; i++) {
92 UTIL_CHECK(meshDims[i] > 0);
93 }
94
95 // If MAX_THREADS_PER_BLOCK hasn't been set at all, initialize
96 if (MAX_THREADS_PER_BLOCK == -1) {
97 init();
98 }
99
100 // Check if requested number of threads matches the previous request
101 bool match = true;
102 if ((blockSize > 0) && (blockSize != THREADS_PER_BLOCK)) {
103 match = false;
104 } else if (invert) {
105 if (meshDims[D-1] != MESH_DIMS.x) match = false;
106 if (D > 1) { if (meshDims[D-2] != MESH_DIMS.y) match = false; }
107 if (D > 2) { if (meshDims[D-3] != MESH_DIMS.z) match = false; }
108 } else {
109 if (meshDims[0] != MESH_DIMS.x) match = false;
110 if (D > 1) { if (meshDims[1] != MESH_DIMS.y) match = false; }
111 if (D > 2) { if (meshDims[2] != MESH_DIMS.z) match = false; }
112 }
113
114 if (match) {
115 // Do nothing. Reuse previous execution configuration.
116 return;
117 }
118
119 // Store meshDims in MESH_DIMS in dim3 format
120 MESH_DIMS.y = 1;
121 MESH_DIMS.z = 1;
122 if (invert) {
123 MESH_DIMS.x = meshDims[D-1];
124 if (D > 1) MESH_DIMS.y = meshDims[D-2];
125 if (D > 2) MESH_DIMS.z = meshDims[D-3];
126 } else {
127 MESH_DIMS.x = meshDims[0];
128 if (D > 1) MESH_DIMS.y = meshDims[1];
129 if (D > 2) MESH_DIMS.z = meshDims[2];
130 }
131
132 // Assign the total block size parameter if not provided
133 bool manualBlockSize = false;
134 if (blockSize < 0) {
135 blockSize = MAX_THREADS_PER_BLOCK;
136 } else {
137 manualBlockSize = true;
138 if ((blockSize & (blockSize - 1)) != 0) { // blockSize not power of 2
139 UTIL_THROW("Manual block size entry must be a power of 2.");
140 }
141 if (blockSize < WARP_SIZE) {
142 Log::file() << "\nRequested threads per block: " << blockSize
143 << "\nWarp size: " << WARP_SIZE << std::endl;
144 UTIL_THROW("Threads per block cannot be smaller than warp size.");
145 }
146 }
147 THREADS_PER_BLOCK = blockSize;
148
149 // Set BLOCK_DIMS.x to maximize opportunities for coalescence.
150 // (Note: we restrict block dimensions to powers of 2.)
151 if (MESH_DIMS.x % WARP_SIZE == 0) {
152 // If MESH_DIMS.x is a multiple of WARP_SIZE, BLOCK_DIMS.x can be
153 // set to WARP_SIZE, resulting in no unused threads along x and
154 // optimal coalescence.
155 BLOCK_DIMS.x = WARP_SIZE;
156 } else if ((MESH_DIMS.x & (MESH_DIMS.x - 1)) == 0) {
157 // MESH_DIMS.x is a small power of two, so we can set BLOCK_DIMS.x
158 // equal to MESH_DIMS.x and have no unused threads along x,
159 // resulting in optimal coalescence.
160 BLOCK_DIMS.x = MESH_DIMS.x;
161 } else {
162 // MESH_DIMS.x is not a power of 2 or multiple of WARP_SIZE.
163 // Choose block dimensions to be either a warp or half-warp in the
164 // x dimension depending on the number of unused threads.
165 if ((MESH_DIMS.x / (WARP_SIZE/2)) % 2 == 0) {
166 // half warp will have smaller number of unused threads
167 BLOCK_DIMS.x = WARP_SIZE / 2;
168 } else {
169 // half and full warp will have same number of unused threads
170 BLOCK_DIMS.x = WARP_SIZE;
171 }
172 }
173
174 // Set BLOCK_DIMS.y to lowest power of 2 that is >= MESH_DIMS.y
175 BLOCK_DIMS.y = 1;
176 while (BLOCK_DIMS.y < MESH_DIMS.y) {
177 BLOCK_DIMS.y *= 2;
178 }
179 GRID_DIMS.y = 1; // set GRID_DIMS.y
180
181 // Set BLOCK_DIMS.z to lowest power of 2 that is >= MESH_DIMS.z
182 BLOCK_DIMS.z = 1;
183 while (BLOCK_DIMS.z < MESH_DIMS.z) {
184 BLOCK_DIMS.z *= 2;
185 }
186
187 // BLOCK_DIMS.z cannot exceed 64. Adjust if needed.
188 if (BLOCK_DIMS.z > 64) {
189 BLOCK_DIMS.z = 64;
190 }
191
192 // Set GRID_DIMS.z
193 GRID_DIMS.z = MESH_DIMS.z / BLOCK_DIMS.z;
194 if (MESH_DIMS.z % BLOCK_DIMS.z) GRID_DIMS.z++;
195
196 // Shrink BLOCK_DIMS until block size size equals THREADS_PER_BLOCK
197 blockSize = BLOCK_DIMS.x * BLOCK_DIMS.y * BLOCK_DIMS.z;
198 if (blockSize >= THREADS_PER_BLOCK) {
199
200 while (blockSize > THREADS_PER_BLOCK) {
201 // Determine how many unused gridpoints lie along y and z
202 int yOvershoot = BLOCK_DIMS.y * GRID_DIMS.y - MESH_DIMS.y;
203 int zOvershoot = BLOCK_DIMS.z * GRID_DIMS.z - MESH_DIMS.z;
204
205 // Halve the element of BLOCK_DIMS with largest overshoot
206 if ((yOvershoot > zOvershoot) || (BLOCK_DIMS.z == 1)) {
207 UTIL_CHECK(BLOCK_DIMS.y > 1);
208 BLOCK_DIMS.y /= 2;
209 GRID_DIMS.y = MESH_DIMS.y / BLOCK_DIMS.y;
210 if (MESH_DIMS.y % BLOCK_DIMS.y) GRID_DIMS.y++;
211 } else {
212 BLOCK_DIMS.z /= 2;
213 GRID_DIMS.z = MESH_DIMS.z / BLOCK_DIMS.z;
214 if (MESH_DIMS.z % BLOCK_DIMS.z) GRID_DIMS.z++;
215 }
216
217 blockSize = BLOCK_DIMS.x * BLOCK_DIMS.y * BLOCK_DIMS.z;
218 }
219 UTIL_CHECK(blockSize == THREADS_PER_BLOCK);
220 UTIL_CHECK(blockSize % WARP_SIZE == 0);
221
222 } else {
223 // blockSize is less than THREADS_PER_BLOCK.
224 // BLOCK_DIMS.x can be increased.
225 while ((BLOCK_DIMS.x < MESH_DIMS.x) && (blockSize < THREADS_PER_BLOCK))
226 {
227 BLOCK_DIMS.x *= 2;
228 blockSize *= 2;
229 }
230 if (blockSize < WARP_SIZE) {
231 // Make sure blockSize is at least one warp
232 UTIL_CHECK(blockSize > 0);
233 while (blockSize < WARP_SIZE) {
234 BLOCK_DIMS.x *= 2;
235 blockSize *= 2;
236 }
237 }
238
239 THREADS_PER_BLOCK = blockSize;
240 }
241
242 // Set GRID_DIMS.x
243 GRID_DIMS.x = MESH_DIMS.x / BLOCK_DIMS.x;
244 if (MESH_DIMS.x % BLOCK_DIMS.x) GRID_DIMS.x++;
245
246 // If block is smaller than manually requested size, print warning
247 if ((manualBlockSize) && (blockSize < THREADS_PER_BLOCK)) {
248 Log::file() << "WARNING: The number of threads per block ("
249 << blockSize
250 << ") will be smaller than \nthe requested size of "
251 << THREADS_PER_BLOCK << "." << std::endl;
252 }
253
254 // Set UNUSED_THREADS
255 if ((MESH_DIMS.x % BLOCK_DIMS.x) || (MESH_DIMS.y % BLOCK_DIMS.y) ||
256 (MESH_DIMS.z % BLOCK_DIMS.z)) {
257 UNUSED_THREADS = true;
258 } else {
259 UNUSED_THREADS = false;
260 }
261
262 checkConfig();
263 }
264
265 // Instantiate setConfig methods for D = 1, 2, 3
266 template void setConfig<1>(IntVec<1> const & meshDims,
267 bool invert, int blockSize);
268 template void setConfig<2>(IntVec<2> const & meshDims,
269 bool invert, int blockSize);
270 template void setConfig<3>(IntVec<3> const & meshDims,
271 bool invert, int blockSize);
272
274 {
275 // Check that max threads per block is a power of two.
276 if ((MAX_THREADS_PER_BLOCK & (MAX_THREADS_PER_BLOCK - 1)) != 0) {
277 Log::file() << "\nMax threads per block: " << MAX_THREADS_PER_BLOCK
278 << std::endl;
279 UTIL_THROW("Max threads per block must be a power of two.");
280 }
281
282 // Check that max threads per block is multiple of WARP_SIZE.
283 if (MAX_THREADS_PER_BLOCK % WARP_SIZE != 0)
284 {
285 Log::file() << "\nMax threads per block: " << MAX_THREADS_PER_BLOCK
286 << "\nWarp size: " << WARP_SIZE << std::endl;
287 UTIL_THROW("Max threads per block must be a multiple of warp size.");
288 }
289
290 // Check that threads per block is multiple of WARP_SIZE.
291 if (THREADS_PER_BLOCK % WARP_SIZE != 0)
292 {
293 Log::file() << "\nThreads per block: " << THREADS_PER_BLOCK
294 << "\nWarp size: " << WARP_SIZE << std::endl;
295 UTIL_THROW("Threads per block must be a multiple of warp size.");
296 }
297
298 // Check that block dimensions are each 1 or greater
299 if ((BLOCK_DIMS.x < 1) || (BLOCK_DIMS.y < 1) || (BLOCK_DIMS.z < 1)) {
300 Log::file() << "\nBlock dimensions: " << BLOCK_DIMS.x << " "
301 << BLOCK_DIMS.y << " " << BLOCK_DIMS.z << std::endl;
302 UTIL_THROW("Block dimensions must each be a power of two.");
303 }
304
305 // Check that BLOCK_DIMS multiply to THREADS_PER_BLOCK
306 if (BLOCK_DIMS.x * BLOCK_DIMS.y * BLOCK_DIMS.z != THREADS_PER_BLOCK) {
307 UTIL_THROW("THREADS_PER_BLOCK not properly set.");
308 }
309
310 // Check that the thread grid is at least as big as the requested mesh
311 if ((BLOCK_DIMS.x * GRID_DIMS.x < MESH_DIMS.x) ||
312 (BLOCK_DIMS.y * GRID_DIMS.y < MESH_DIMS.y) ||
313 (BLOCK_DIMS.z * GRID_DIMS.z < MESH_DIMS.z)) {
314 Log::file() << "\nBlock dimensions: " << BLOCK_DIMS.x << " "
315 << BLOCK_DIMS.y << " " << BLOCK_DIMS.z << std::endl;
316 Log::file() << "\nGrid dimensions: " << GRID_DIMS.x << " "
317 << GRID_DIMS.y << " " << GRID_DIMS.z << std::endl;
318 Log::file() << "\nMesh dimensions: " << MESH_DIMS.x << " "
319 << MESH_DIMS.y << " " << MESH_DIMS.z << std::endl;
320 UTIL_THROW("Thread grid smaller than the requested mesh.");
321 }
322
323 // Check that the maximum number of threads per multiprocessor is an
324 // integer multiple of the threads per block. This is not required
325 // for validity, but performance will be suboptimal if not the case,
326 // as it will limit the total number of threads that can be
327 // scheduled at any given time.
328 if (MAX_THREADS_PER_SM % MAX_THREADS_PER_BLOCK != 0) {
329 Log::file() << "WARNING: The number of threads per block ("
330 << MAX_THREADS_PER_BLOCK
331 << ") is not an even divisor of the maximum number"
332 << " of threads per streaming multiprocessor ("
333 << MAX_THREADS_PER_SM
334 << "). Performance will be suboptimal."
335 << std::endl;
336 }
337 }
338
342 void setThreadsPerBlock(int blockSize)
343 {
344 // If MAX_THREADS_PER_BLOCK hasn't been set at all, initialize
345 if (MAX_THREADS_PER_BLOCK == -1) {
346 init();
347 }
348
349 if ((blockSize & (blockSize - 1)) != 0) { // blockSize not power of 2
350 UTIL_THROW("Manual block size entry must be a power of 2.");
351 }
352
353 if (blockSize < WARP_SIZE) {
354 Log::file() << "\nRequested threads per block: " << blockSize
355 << "\nWarp size: " << WARP_SIZE << std::endl;
356 UTIL_THROW("Threads per block cannot be smaller than warp size.");
357 }
358
359 MAX_THREADS_PER_BLOCK = blockSize;
360 }
361
362 // Accessors
363
364 dim3 gridDims()
365 { return GRID_DIMS; }
366
368 { return BLOCK_DIMS; }
369
370 dim3 meshDims()
371 { return MESH_DIMS; }
372
374 { return WARP_SIZE; }
375
377 { return UNUSED_THREADS; }
378
379} // namespace ThreadMesh
380} // namespace Pscf
An IntVec<D, T> is a D-component vector of elements of integer type T.
Definition IntVec.h:27
static std::ostream & file()
Get log ostream by reference.
Definition Log.cpp:57
#define UTIL_CHECK(condition)
Assertion macro suitable for serial or parallel production code.
Definition global.h:68
#define UTIL_THROW(msg)
Macro for throwing an Exception, reporting function, file and line number.
Definition global.h:51
int warpSize()
Get the warp size.
void setConfig(IntVec< D > const &meshDims, bool invert, int blockSize)
Given a multidimensional grid of threads, set execution configuration.
Definition ThreadMesh.cu:88
void checkConfig()
Check the execution configuration (grid and block dimensions).
bool hasUnusedThreads()
Will there be unused threads?
dim3 gridDims()
Get the multidimensional grid of blocks determined by setConfig.
dim3 meshDims()
Return last requested multidimensional grid of threads.
dim3 blockDims()
Get the dimensions of a single block determined by setConfig.
void setThreadsPerBlock(int blockSize)
Manually set the block size that should be used by default.
PSCF package top-level namespace.
Definition param_pc.dox:1
Utility classes for scientific computation.