PSCF v1.1
ThreadGrid.cu
1#ifndef PSPG_THREADGRID_CU
2#define PSPG_THREADGRID_CU
3
4#include "ThreadGrid.h"
5
namespace {

   // File-local state for the free functions of namespace ThreadGrid.
   // Keeping these variables in an anonymous namespace makes them
   // persistent but invisible outside this file, in the same spirit as
   // private static members of a class.

   // Upper limit on threads per block. Holds -1 until a value is
   // assigned, either from a hardware query or explicitly by the user.
   int MAX_THREADS_PER_BLOCK = -1;

   // Threads per block used for execution; assigned by setThreadsLogical.
   int THREADS_PER_BLOCK = -1;

   // Number of blocks used for execution; assigned by setThreadsLogical.
   int BLOCKS = -1;

   // Total number of threads most recently requested via setThreadsLogical.
   int THREADS_LOGICAL = -1;

   // True if the execution grid contains more threads than were requested.
   bool UNUSED_THREADS = false;

}
28
29namespace Pscf {
30namespace Pspg {
31namespace ThreadGrid {
32
33 using namespace Util;
34
// Initialize the static variables of the ThreadGrid namespace.
//
// Verifies that at least one CUDA device can be found, then assigns a
// default number of threads per block by calling setThreadsPerBlock().
//
// Throws (via UTIL_THROW) if the device count query fails or reports
// no devices.
void init()
{
   // Check that a CUDA device is available. A failed query is treated
   // the same way as an empty device list.
   int count = 0;
   cudaError_t status = cudaGetDeviceCount(&count);
   if (status != cudaSuccess || count < 1) {
      UTIL_THROW("No CUDA devices found.");
   }

   // Set a default maximum threads per block by querying hardware.
   setThreadsPerBlock();
}
47
// Set the number of threads per block to a default value.
//
// The default is derived from the properties of device 0: the largest
// power of two that divides the maximum number of threads per streaming
// multiprocessor, halved as needed to respect the per-block limit.
//
// Throws (via UTIL_THROW) if the device properties cannot be queried.
void setThreadsPerBlock()
{
   // Get properties, assuming one GPU (device 0). The result must not
   // be read if the query fails, since dprop would be uninitialized.
   cudaDeviceProp dprop;
   if (cudaGetDeviceProperties(&dprop, 0) != cudaSuccess) {
      UTIL_THROW("Unable to query CUDA device properties.");
   }
   int maxThPerSM = dprop.maxThreadsPerMultiProcessor;

   // Isolate the lowest set bit of maxThPerSM. This is the highest
   // power of two that evenly divides the maximum number of threads
   // per streaming multiprocessor (chosen with occupancy in mind).
   int threadsPerBlock = (maxThPerSM & (~(maxThPerSM - 1)));

   // Halve until the value satisfies the hardware per-block limit.
   while (threadsPerBlock > dprop.maxThreadsPerBlock) {
      threadsPerBlock /= 2;
   }

   setThreadsPerBlock(threadsPerBlock);
}
71
// Set the maximum number of threads per block to a user-chosen value.
//
// Resets the stored block count and logical thread count to zero, so
// that the next call to setThreadsLogical recomputes the execution
// configuration, then validates the new value.
//
// nThreadsPerBlock: requested threads per block (must be positive).
//
// Throws (via UTIL_THROW) if nThreadsPerBlock is not positive, or if
// checkExecutionConfig() rejects the value.
void setThreadsPerBlock(int nThreadsPerBlock)
{
   // Reject non-positive values before touching any state; zero would
   // otherwise be used later as a divisor.
   if (nThreadsPerBlock < 1) {
      UTIL_THROW("Number of threads per block must be positive.");
   }

   MAX_THREADS_PER_BLOCK = nThreadsPerBlock;
   BLOCKS = 0;
   THREADS_LOGICAL = 0;

   // Validate the new setting against hardware properties.
   checkExecutionConfig();
}
79
// Set the total number of threads required for execution.
//
// Computes the execution configuration (threads per block and number
// of blocks) needed to provide at least nThreadsLogical threads, and
// records whether the resulting grid has surplus threads. If the
// request matches the previous one, the stored configuration is reused.
//
// nThreadsLogical: total number of threads requested (must be > 0).
void setThreadsLogical(int nThreadsLogical)
{
   // Verify that requested threads is valid (greater than 0).
   UTIL_ASSERT(nThreadsLogical > 0);

   // If max_threads_per_block hasn't been set at all, initialize.
   if (MAX_THREADS_PER_BLOCK == -1) {
      init();
   }

   // Same request as last time: previous configuration remains valid.
   if (THREADS_LOGICAL == nThreadsLogical) {
      return;
   }

   // Set the number of total requested threads.
   THREADS_LOGICAL = nThreadsLogical;

   // Compute the execution configuration. The number of blocks is the
   // quotient rounded up, done in integer arithmetic so that no
   // floating point conversion is involved.
   THREADS_PER_BLOCK = MAX_THREADS_PER_BLOCK;
   const int remainder = nThreadsLogical % THREADS_PER_BLOCK;
   BLOCKS = nThreadsLogical / THREADS_PER_BLOCK;
   if (remainder > 0) {
      ++BLOCKS;
   }

   // There are unused threads exactly when the request is not a
   // multiple of the block size. Testing the remainder avoids forming
   // the product BLOCKS*THREADS_PER_BLOCK, which could overflow.
   UNUSED_THREADS = (remainder != 0);
}
107
109 {
111
112 nBlocks = BLOCKS;
113 }
114
115 void
117 {
119
120 nBlocks = BLOCKS;
121 nThreads = THREADS_PER_BLOCK;
122 }
123
// Check the execution configuration (threads per block).
//
// Validates the current maximum threads per block against the
// properties of device 0:
//   - it must be positive,
//   - it must be a power of two (required for parallel reductions),
//   - it must be a multiple of the warp size.
// Violations of these rules throw via UTIL_THROW. In addition, a warning
// is written to std::cerr if the value does not evenly divide the
// maximum number of threads per streaming multiprocessor.
void checkExecutionConfig()
{
   // A non-positive value can never be valid, and zero would be used
   // as a divisor below.
   if (MAX_THREADS_PER_BLOCK < 1) {
      UTIL_THROW("Number of threads per block must be positive.");
   }

   // Get relevant device hardware properties, assuming one device.
   cudaDeviceProp dprop;
   if (cudaGetDeviceProperties(&dprop, 0) != cudaSuccess) {
      UTIL_THROW("Unable to query CUDA device properties.");
   }
   int warpSize = dprop.warpSize;
   int maxThreadsPerMultiProcessor = dprop.maxThreadsPerMultiProcessor;

   // Check that threads per block is a power of two.
   // This is required for parallel reductions.
   if ((MAX_THREADS_PER_BLOCK & (MAX_THREADS_PER_BLOCK - 1)) != 0) {
      UTIL_THROW("Set number of threads per block must be a power of two.");
   }

   // Check that threads per block is a multiple of warpSize.
   if (MAX_THREADS_PER_BLOCK % warpSize != 0) {
      char buffer[100];
      snprintf(buffer, sizeof(buffer),
         "Number of threads per block must be a multiple of warp size %d.\n",
         warpSize);
      UTIL_THROW(buffer);
   }

   // Check that the maximum number of threads per multiprocessor is an
   // integer multiple of the threads per block. This is not required for
   // validity, but performance will be suboptimal if not the case, as it
   // will limit the total number of threads that can be scheduled at any
   // given time.
   if (maxThreadsPerMultiProcessor % MAX_THREADS_PER_BLOCK != 0) {
      std::cerr
         << "WARNING: The number of threads per block ("
         << MAX_THREADS_PER_BLOCK
         << ") is not an even divisor of the maximum number"
         << " of threads per streaming multiprocessor ("
         << maxThreadsPerMultiProcessor
         << "). Performance will be suboptimal."
         << std::endl;
   }
}
167
168 // Accessors
169
// Get the current number of blocks for execution.
int nBlocks()
{  return BLOCKS; }
172
// Get the number of threads per block for execution.
int nThreads()
{  return THREADS_PER_BLOCK; }
175
// Return the previously requested total number of threads.
int nThreadsLogical()
{  return THREADS_LOGICAL; }
178
// Indicate whether there will be unused threads in the execution grid.
bool hasUnusedThreads()
{  return UNUSED_THREADS; }
181
182}
183}
184}
185#endif
#define UTIL_THROW(msg)
Macro for throwing an Exception, reporting function, file and line number.
Definition: global.h:51
#define UTIL_ASSERT(condition)
Assertion macro suitable for debugging serial or parallel code.
Definition: global.h:75
bool hasUnusedThreads()
Indicates whether there will be unused threads.
Definition: ThreadGrid.cu:179
int nThreadsLogical()
Return previously requested total number of threads.
Definition: ThreadGrid.cu:176
int nThreads()
Get the number of threads per block for execution.
Definition: ThreadGrid.cu:173
int nBlocks()
Get the current number of blocks for execution.
Definition: ThreadGrid.cu:170
void setThreadsPerBlock()
Set the number of threads per block to a default value.
Definition: ThreadGrid.cu:48
void checkExecutionConfig()
Check the execution configuration (threads and block counts).
Definition: ThreadGrid.cu:124
void setThreadsLogical(int nThreadsLogical)
Set the total number of threads required for execution.
Definition: ThreadGrid.cu:80
void init()
Initialize static variables in Pspg::ThreadGrid namespace.
Definition: ThreadGrid.cu:35
C++ namespace for polymer self-consistent field theory (PSCF).
Utility classes for scientific computation.
Definition: accumulators.mod:1