PSCF v1.2
ThreadArray.cu
1#ifndef PSCF_THREAD_ARRAY_CU
2#define PSCF_THREAD_ARRAY_CU
3
#include "ThreadArray.h"
#include <cuda_runtime.h>
#include <cstdio>
6
namespace {

   /*
   * File-local state used by the free functions of namespace
   * Pscf::ThreadArray. Placing these variables in an unnamed namespace
   * gives them internal linkage, so they act like private static members
   * of a class: persistent, but invisible outside this file.
   */

   // Upper limit on threads per block, taken from a hardware query or
   // from the user. The value -1 means "not set yet"; setThreadsLogical
   // tests for it to decide whether init() must be called.
   int MAX_THREADS_PER_BLOCK = -1;

   // Threads per block used for execution (assigned by setThreadsLogical).
   int THREADS_PER_BLOCK = -1;

   // Number of blocks used for execution (assigned by setThreadsLogical).
   int BLOCKS = -1;

   // Total number of threads requested (assigned by setThreadsLogical).
   int THREADS_LOGICAL = -1;

   // Number of threads per warp.
   int WARP_SIZE = -1;

   // True when BLOCKS*THREADS_PER_BLOCK exceeds the number of threads
   // requested, i.e., when some launched threads have no work.
   bool UNUSED_THREADS = false;

}
34
35namespace Pscf {
36namespace ThreadArray {
37
38 using namespace Util;
39
/*
* Initialize the static variables of namespace Pscf::ThreadArray.
*
* Confirms that a CUDA device can be found, warns through Log::file() if
* more than one device is present, and then selects a default number of
* threads per block by calling setThreadsPerBlock().
*
* Throws an Exception (via UTIL_THROW) if no device is found or if the
* device query itself fails.
*/
void init()
{
   // Query the device count and keep the returned status, so that a
   // failed query is reported as such rather than as "no devices".
   int count = 0;
   const cudaError_t status = cudaGetDeviceCount(&count);

   if (status == cudaErrorNoDevice
       || (status == cudaSuccess && count == 0)) {
      UTIL_THROW("No CUDA devices found.");
   }
   if (status != cudaSuccess) {
      char buffer[256];
      snprintf(buffer, sizeof(buffer),
               "cudaGetDeviceCount failed: %s",
               cudaGetErrorString(status));
      UTIL_THROW(buffer);
   }

   if (count > 1) {
      Log::file() << "\nWarning: multiple GPUs detected.\n"
         << "This program is not compatible with multiple devices.\n"
         << "Only the first device will be used." << std::endl;
   }

   // Set a default maximum threads per block by querying hardware.
   // setThreadsLogical relies on this call to move
   // MAX_THREADS_PER_BLOCK away from its initial value of -1.
   setThreadsPerBlock();
}
57
/*
* Set the number of threads per block to a default value.
*
* The default is the largest power of two that evenly divides the maximum
* number of threads per streaming multiprocessor of device 0, halved as
* often as needed so that it does not exceed the device limit on threads
* per block. The result is passed to setThreadsPerBlock(int).
*
* Throws an Exception (via UTIL_THROW) if the device properties cannot
* be read.
*/
void setThreadsPerBlock()
{
   // Get properties, assuming one GPU (device 0). The status is checked
   // because dprop is left uninitialized if the query fails.
   cudaDeviceProp dprop;
   const cudaError_t status = cudaGetDeviceProperties(&dprop, 0);
   if (status != cudaSuccess) {
      char buffer[256];
      snprintf(buffer, sizeof(buffer),
               "cudaGetDeviceProperties failed: %s",
               cudaGetErrorString(status));
      UTIL_THROW(buffer);
   }
   int maxThPerSM = dprop.maxThreadsPerMultiProcessor;

   // Isolate the lowest set bit of maxThPerSM. This is the highest power
   // of two that evenly divides the maximum number of threads per
   // streaming multiprocessor, chosen to allow the highest occupancy.
   int threadsPerBlock = (maxThPerSM & (~(maxThPerSM - 1)));

   // Reduce until the value respects the per-block hardware limit.
   while (threadsPerBlock > dprop.maxThreadsPerBlock) {
      threadsPerBlock /= 2;
   }

   setThreadsPerBlock(threadsPerBlock);
}
78
/*
* Set the maximum number of threads per block to a requested value.
*
* Any previously computed block count and logical thread count are
* discarded, so the next call to setThreadsLogical cannot take its
* "same request as before" shortcut and must recompute the configuration.
*
* \param nThreadsPerBlock  requested threads per block (must be positive)
*
* Throws an Exception (via UTIL_THROW) if nThreadsPerBlock is not
* positive, or if checkExecutionConfig() rejects the value.
*/
void setThreadsPerBlock(int nThreadsPerBlock)
{
   // Reject non-positive values before touching any state. Zero would
   // otherwise pass the power-of-two test and later be used as a divisor.
   if (nThreadsPerBlock <= 0) {
      UTIL_THROW("Threads per block must be positive.");
   }

   MAX_THREADS_PER_BLOCK = nThreadsPerBlock;
   BLOCKS = 0;
   THREADS_LOGICAL = 0;

   // Validate the new value against the hardware (also sets WARP_SIZE).
   checkExecutionConfig();
}
86
/*
* Given a total number of threads, set the 1D execution configuration.
*
* Stores the request and computes the number of threads per block, the
* number of blocks (rounded up), and whether the launch will contain
* unused threads. If the request equals the previous one, the existing
* configuration is kept unchanged.
*
* \param nThreadsLogical  total number of threads required (must be > 0)
*
* Throws an Exception (via UTIL_THROW) if the threads-per-block limit is
* not positive after initialization.
*/
void setThreadsLogical(int nThreadsLogical)
{
   // Verify that requested threads is valid (greater than 0).
   UTIL_ASSERT(nThreadsLogical > 0);

   // If the threads-per-block limit hasn't been set at all, initialize.
   if (MAX_THREADS_PER_BLOCK == -1) {
      init();
   }

   // Guard the divisions below against a zero or negative divisor.
   if (MAX_THREADS_PER_BLOCK <= 0) {
      UTIL_THROW("Threads per block must be positive.");
   }

   // Same request as last time: the previous configuration is reused.
   if (THREADS_LOGICAL == nThreadsLogical) {
      return;
   }

   // Record the request and the block size used for execution.
   THREADS_LOGICAL = nThreadsLogical;
   THREADS_PER_BLOCK = MAX_THREADS_PER_BLOCK;

   // Number of blocks, rounded up. Quotient and remainder are used
   // instead of floating point or a product, so that no intermediate
   // value can exceed the range of int.
   const int remainder = nThreadsLogical % THREADS_PER_BLOCK;
   BLOCKS = nThreadsLogical / THREADS_PER_BLOCK;
   if (remainder != 0) {
      ++BLOCKS;
   }

   // The last block is only partly used when the division is inexact.
   UNUSED_THREADS = (remainder != 0);
}
114
// NOTE(review): the header and the first statement of this definition are
// absent from this listing; only the remainder of the body is visible.
// The visible statement copies the file-level BLOCKS value into nBlocks,
// which is presumably a by-reference output parameter -- TODO confirm
// against the declarations in ThreadArray.h.
116 {
118
119 nBlocks = BLOCKS;
120 }
121
// NOTE(review): the function name, parameter list and first statement of
// this definition are absent from this listing. The visible statements
// copy the file-level BLOCKS and THREADS_PER_BLOCK values into nBlocks and
// nThreads, which are presumably by-reference output parameters -- TODO
// confirm against the declarations in ThreadArray.h.
122 void
124 {
126
127 nBlocks = BLOCKS;
128 nThreads = THREADS_PER_BLOCK;
129 }
130
/*
* Check the execution configuration (threads and block counts).
*
* Reads the properties of device 0, stores its warp size in WARP_SIZE,
* and validates MAX_THREADS_PER_BLOCK: it must be positive, a power of
* two, and a multiple of the warp size. A warning is written to std::cerr
* if it does not evenly divide the maximum number of threads per
* streaming multiprocessor.
*
* Throws an Exception (via UTIL_THROW) if the device properties cannot
* be read or if any of the required conditions is violated.
*/
void checkExecutionConfig()
{
   // Get relevant device hardware properties, assuming one device.
   cudaDeviceProp dprop;
   const cudaError_t status = cudaGetDeviceProperties(&dprop, 0);
   if (status != cudaSuccess) {
      char buffer[256];
      snprintf(buffer, sizeof(buffer),
               "cudaGetDeviceProperties failed: %s",
               cudaGetErrorString(status));
      UTIL_THROW(buffer);
   }
   WARP_SIZE = dprop.warpSize;
   int maxThreadsPerMultiProcessor = dprop.maxThreadsPerMultiProcessor;

   // Check that threads per block is positive. Zero would satisfy both
   // bit tests below and then be used as a divisor in the final check.
   if (MAX_THREADS_PER_BLOCK <= 0) {
      UTIL_THROW("Threads per block must be positive.");
   }

   // Check that threads per block is a power of two.
   // This is required for parallel reductions.
   if ((MAX_THREADS_PER_BLOCK & (MAX_THREADS_PER_BLOCK - 1)) != 0) {
      UTIL_THROW("Threads per block must be a power of two.");
   }

   // Check that threads per block is a multiple of WARP_SIZE.
   if (MAX_THREADS_PER_BLOCK % WARP_SIZE != 0) {
      char buffer[100];
      snprintf(buffer, sizeof(buffer),
               "Threads per block must be a multiple of warp size %d.\n",
               WARP_SIZE);
      UTIL_THROW(buffer);
   }

   // Check that the maximum number of threads per multiprocessor is an
   // integer multiple of the threads per block. This is not required
   // for validity, but performance will be suboptimal if not the case,
   // as it will limit the total number of threads that can be
   // scheduled at any given time.
   if (maxThreadsPerMultiProcessor % MAX_THREADS_PER_BLOCK != 0) {
      std::cerr
         << "WARNING: The number of threads per block ("
         << MAX_THREADS_PER_BLOCK
         << ") is not an even divisor of the maximum number"
         << " of threads per streaming multiprocessor ("
         << maxThreadsPerMultiProcessor
         << "). Performance will be suboptimal."
         << std::endl;
   }
}
175
176 // Accessors
177
// Get the current number of blocks for execution.
int nBlocks()
{  return BLOCKS; }
180
// Get the number of threads per block for execution.
int nThreads()
{  return THREADS_PER_BLOCK; }
183
// Return the previously requested total number of threads.
int nThreadsLogical()
{  return THREADS_LOGICAL; }
186
// Get the warp size, as stored by checkExecutionConfig().
int warpSize()
{  return WARP_SIZE; }
189
// Indicates whether there will be unused threads in the launch.
bool hasUnusedThreads()
{  return UNUSED_THREADS; }
192
193}
194}
195#endif
static std::ostream & file()
Get log ostream by reference.
Definition Log.cpp:57
#define UTIL_THROW(msg)
Macro for throwing an Exception, reporting function, file and line number.
Definition global.h:51
#define UTIL_ASSERT(condition)
Assertion macro suitable for debugging serial or parallel code.
Definition global.h:75
void checkExecutionConfig()
Check the execution configuration (threads and block counts).
int warpSize()
Get the warp size.
void init()
Initialize static variables in Pscf::ThreadArray namespace.
void setThreadsLogical(int nThreadsLogical)
Given total number of threads, set 1D execution configuration.
int nThreads()
Get the number of threads per block for execution.
int nThreadsLogical()
Return previously requested total number of threads.
bool hasUnusedThreads()
Indicates whether there will be unused threads.
void setThreadsPerBlock()
Set the number of threads per block to a default value.
int nBlocks()
Get the current number of blocks for execution.
PSCF package top-level namespace.
Definition param_pc.dox:1
Utility classes for scientific computation.