91 for (
int i = 0; i < D; i++) {
96 if (MAX_THREADS_PER_BLOCK == -1) {
102 if ((blockSize > 0) && (blockSize != THREADS_PER_BLOCK)) {
105 if (
meshDims[D-1] != MESH_DIMS.x) match =
false;
106 if (D > 1) {
if (
meshDims[D-2] != MESH_DIMS.y) match =
false; }
107 if (D > 2) {
if (
meshDims[D-3] != MESH_DIMS.z) match =
false; }
109 if (
meshDims[0] != MESH_DIMS.x) match =
false;
110 if (D > 1) {
if (
meshDims[1] != MESH_DIMS.y) match =
false; }
111 if (D > 2) {
if (
meshDims[2] != MESH_DIMS.z) match =
false; }
124 if (D > 1) MESH_DIMS.y =
meshDims[D-2];
125 if (D > 2) MESH_DIMS.z =
meshDims[D-3];
128 if (D > 1) MESH_DIMS.y =
meshDims[1];
129 if (D > 2) MESH_DIMS.z =
meshDims[2];
133 bool manualBlockSize =
false;
135 blockSize = MAX_THREADS_PER_BLOCK;
137 manualBlockSize =
true;
138 if ((blockSize & (blockSize - 1)) != 0) {
139 UTIL_THROW(
"Manual block size entry must be a power of 2.");
141 if (blockSize < WARP_SIZE) {
142 Log::file() <<
"\nRequested threads per block: " << blockSize
143 <<
"\nWarp size: " << WARP_SIZE << std::endl;
144 UTIL_THROW(
"Threads per block cannot be smaller than warp size.");
147 THREADS_PER_BLOCK = blockSize;
151 if (MESH_DIMS.x % WARP_SIZE == 0) {
155 BLOCK_DIMS.x = WARP_SIZE;
156 }
else if ((MESH_DIMS.x & (MESH_DIMS.x - 1)) == 0) {
160 BLOCK_DIMS.x = MESH_DIMS.x;
165 if ((MESH_DIMS.x / (WARP_SIZE/2)) % 2 == 0) {
167 BLOCK_DIMS.x = WARP_SIZE / 2;
170 BLOCK_DIMS.x = WARP_SIZE;
176 while (BLOCK_DIMS.y < MESH_DIMS.y) {
183 while (BLOCK_DIMS.z < MESH_DIMS.z) {
188 if (BLOCK_DIMS.z > 64) {
193 GRID_DIMS.z = MESH_DIMS.z / BLOCK_DIMS.z;
194 if (MESH_DIMS.z % BLOCK_DIMS.z) GRID_DIMS.z++;
197 blockSize = BLOCK_DIMS.x * BLOCK_DIMS.y * BLOCK_DIMS.z;
198 if (blockSize >= THREADS_PER_BLOCK) {
200 while (blockSize > THREADS_PER_BLOCK) {
202 int yOvershoot = BLOCK_DIMS.y * GRID_DIMS.y - MESH_DIMS.y;
203 int zOvershoot = BLOCK_DIMS.z * GRID_DIMS.z - MESH_DIMS.z;
206 if ((yOvershoot > zOvershoot) || (BLOCK_DIMS.z == 1)) {
209 GRID_DIMS.y = MESH_DIMS.y / BLOCK_DIMS.y;
210 if (MESH_DIMS.y % BLOCK_DIMS.y) GRID_DIMS.y++;
213 GRID_DIMS.z = MESH_DIMS.z / BLOCK_DIMS.z;
214 if (MESH_DIMS.z % BLOCK_DIMS.z) GRID_DIMS.z++;
217 blockSize = BLOCK_DIMS.x * BLOCK_DIMS.y * BLOCK_DIMS.z;
225 while ((BLOCK_DIMS.x < MESH_DIMS.x) && (blockSize < THREADS_PER_BLOCK))
230 if (blockSize < WARP_SIZE) {
233 while (blockSize < WARP_SIZE) {
239 THREADS_PER_BLOCK = blockSize;
243 GRID_DIMS.x = MESH_DIMS.x / BLOCK_DIMS.x;
244 if (MESH_DIMS.x % BLOCK_DIMS.x) GRID_DIMS.x++;
247 if ((manualBlockSize) && (blockSize < THREADS_PER_BLOCK)) {
248 Log::file() <<
"WARNING: The number of threads per block ("
250 <<
") will be smaller than \nthe requested size of "
251 << THREADS_PER_BLOCK <<
"." << std::endl;
255 if ((MESH_DIMS.x % BLOCK_DIMS.x) || (MESH_DIMS.y % BLOCK_DIMS.y) ||
256 (MESH_DIMS.z % BLOCK_DIMS.z)) {
257 UNUSED_THREADS =
true;
259 UNUSED_THREADS =
false;
276 if ((MAX_THREADS_PER_BLOCK & (MAX_THREADS_PER_BLOCK - 1)) != 0) {
277 Log::file() <<
"\nMax threads per block: " << MAX_THREADS_PER_BLOCK
279 UTIL_THROW(
"Max threads per block must be a power of two.");
283 if (MAX_THREADS_PER_BLOCK % WARP_SIZE != 0)
285 Log::file() <<
"\nMax threads per block: " << MAX_THREADS_PER_BLOCK
286 <<
"\nWarp size: " << WARP_SIZE << std::endl;
287 UTIL_THROW(
"Max threads per block must be a multiple of warp size.");
291 if (THREADS_PER_BLOCK % WARP_SIZE != 0)
293 Log::file() <<
"\nThreads per block: " << THREADS_PER_BLOCK
294 <<
"\nWarp size: " << WARP_SIZE << std::endl;
295 UTIL_THROW(
"Threads per block must be a multiple of warp size.");
299 if ((BLOCK_DIMS.x < 1) || (BLOCK_DIMS.y < 1) || (BLOCK_DIMS.z < 1)) {
300 Log::file() <<
"\nBlock dimensions: " << BLOCK_DIMS.x <<
" "
301 << BLOCK_DIMS.y <<
" " << BLOCK_DIMS.z << std::endl;
302 UTIL_THROW(
"Block dimensions must each be a power of two.");
306 if (BLOCK_DIMS.x * BLOCK_DIMS.y * BLOCK_DIMS.z != THREADS_PER_BLOCK) {
307 UTIL_THROW(
"THREADS_PER_BLOCK not properly set.");
311 if ((BLOCK_DIMS.x * GRID_DIMS.x < MESH_DIMS.x) ||
312 (BLOCK_DIMS.y * GRID_DIMS.y < MESH_DIMS.y) ||
313 (BLOCK_DIMS.z * GRID_DIMS.z < MESH_DIMS.z)) {
314 Log::file() <<
"\nBlock dimensions: " << BLOCK_DIMS.x <<
" "
315 << BLOCK_DIMS.y <<
" " << BLOCK_DIMS.z << std::endl;
316 Log::file() <<
"\nGrid dimensions: " << GRID_DIMS.x <<
" "
317 << GRID_DIMS.y <<
" " << GRID_DIMS.z << std::endl;
318 Log::file() <<
"\nMesh dimensions: " << MESH_DIMS.x <<
" "
319 << MESH_DIMS.y <<
" " << MESH_DIMS.z << std::endl;
320 UTIL_THROW(
"Thread grid smaller than the requested mesh.");
328 if (MAX_THREADS_PER_SM % MAX_THREADS_PER_BLOCK != 0) {
329 Log::file() <<
"WARNING: The number of threads per block ("
330 << MAX_THREADS_PER_BLOCK
331 <<
") is not an even divisor of the maximum number"
332 <<
" of threads per streaming multiprocessor ("
333 << MAX_THREADS_PER_SM
334 <<
"). Performance will be suboptimal."