PSCF v1.3
cuda/WaveList.tpp
#ifndef PRDC_CUDA_WAVE_LIST_TPP
#define PRDC_CUDA_WAVE_LIST_TPP

/*
* PSCF - Polymer Self-Consistent Field
*
* Copyright 2015 - 2025, The Regents of the University of Minnesota
* Distributed under the terms of the GNU General Public License.
*/

#include "WaveList.h"

#include <prdc/cuda/resources.h>
#include <prdc/cuda/FFT.h>
#include <prdc/crystal/hasVariableAngle.h>
#include <pscf/cuda/HostDArray.h>
#include <pscf/mesh/MeshIterator.h>
#include <pscf/math/IntVec.h>

namespace Pscf {
namespace Prdc {
namespace Cuda {

   // CUDA kernels:
   // (defined in an anonymous namespace, used only in this file)
   namespace {

      /*
      * Compute minimum images of each wavevector, template declaration.
      */
      template <int D>
      __global__ void _computeMinimumImages(int* minImages, cudaReal* kSq,
                                            cudaReal const * kBasis,
                                            int const * meshDims,
                                            int const kSize);

      /*
      * Compute minimum images of each wavevector, 1D explicit specialization.
      *
      * Kernel must be launched with kSize threads. (One thread calculates
      * one minimum image.)
      *
      * When launched, this kernel requires no dynamic memory allocation.
      */
      template <>
      __global__ void _computeMinimumImages<1>(int* minImages, cudaReal* kSq,
                                               cudaReal const * kBasis,
                                               int const * meshDims,
                                               int const kSize)
      {
         unsigned int nThreads = blockDim.x * gridDim.x;
         unsigned int startID = blockIdx.x * blockDim.x + threadIdx.x;

         // Load kBasis and meshDims
         cudaReal kBasis_ = kBasis[0];
         int meshDims_ = meshDims[0];

         for (int i = startID; i < kSize; i += nThreads) {
            int img = minImages[i];

            while (img > (meshDims_>>1)) { // note: x>>1 is same as x/2
               img -= meshDims_;
            }
            while (img < -1*(meshDims_>>1)) {
               img += meshDims_;
            }

            minImages[i] = img;
            kSq[i] = img * img * kBasis_ * kBasis_;
         }
      }

      /*
      * Compute minimum images of each wavevector, 2D explicit specialization.
      *
      * This kernel should be launched with >= 32*kSize threads.
      *
      * 32 threads are used to calculate one minimum image. In the CPU code,
      * we compare 25 different images of a wave to determine the minimum
      * image, and here we choose 32 instead because it is a power of 2,
      * allowing us to use a reduction algorithm to compare the images.
      *
      * When launched, this kernel requires dynamic memory allocation of
      * (2*sizeof(int) + sizeof(cudaReal)) * nThreadsPerBlock bytes.
      */
      template <>
      __global__ void _computeMinimumImages<2>(int* minImages, cudaReal* kSq,
                                               cudaReal const * kBasis,
                                               int const * meshDims,
                                               int const kSize)
      {
         unsigned int startID = blockIdx.x * blockDim.x + threadIdx.x;

         // Determine which wavevector and which image to evaluate
         unsigned int paramID = (startID >> 5); // equivalent to startID / 32
         unsigned int imageID = startID - (paramID * 32); // value in [0, 31]

         // Determine the image that will be evaluated by this thread
         // (uses integer division, not ideal for speed)
         int s0, s1;
         s0 = imageID / 5;
         s1 = imageID - (s0 * 5);
         s0 -= 2; // shift values to go from -2 to 4, not 0 to 6
         s1 -= 2; // shift values to go from -2 to 2, not 0 to 4
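         // Example: imageID = 13 gives s0 = 13/5 = 2 and s1 = 13 - 10 = 3,
         // i.e., the image shifted by (0, +1) mesh periods after the -2
         // offsets. Thread IDs 25-31 exceed the 25 distinct images and give
         // s0 values of 3 or 4; these are valid (if distant) periodic
         // images of the same wave, so the reduction below remains correct.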

         // Set epsilon
         #ifdef SINGLE_PRECISION
         cudaReal epsilon = 1.0E-4;
         #else
         cudaReal epsilon = 1.0E-8;
         #endif

         // Declare dynamically allocated arrays in shared memory
         // (allocated space is shared between cudaReal array and int array)
         extern __shared__ cudaReal kSqVals_[];
         int* images_ = (int*)&kSqVals_[blockDim.x];
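         // Layout of the dynamic shared memory block declared above: the
         // first blockDim.x cudaReal values hold kSqVals_, followed by
         // 2*blockDim.x ints (two image components per thread) for images_,
         // which accounts for the (2*sizeof(int) + sizeof(cudaReal)) *
         // nThreadsPerBlock bytes requested at launch.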

         if (paramID < kSize) { // only evaluate if on the k-grid

            // Load image from global memory and shift based on s0 and s1
            images_[2*threadIdx.x] = minImages[paramID] + (s0 * meshDims[0]);
            images_[2*threadIdx.x+1] = minImages[kSize + paramID] +
                                       (s1 * meshDims[1]);

            // Calculate kSq for this wave
            cudaReal kVec0 = (kBasis[0] * images_[2*threadIdx.x]) +
                             (kBasis[2] * images_[2*threadIdx.x+1]);
            cudaReal kVec1 = (kBasis[1] * images_[2*threadIdx.x]) +
                             (kBasis[3] * images_[2*threadIdx.x+1]);

            kSqVals_[threadIdx.x] = (kVec0*kVec0) + (kVec1*kVec1);
         }
         __syncthreads(); // wait for all threads to finish

         // Perform a parallel reduction on 32 threads to find min image
         // (note: stride >>= 1 is equivalent to stride /= 2)
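         // (Tie-breaking: if two candidate images have kSq values that
         // differ by less than epsilon, they are treated as equal and the
         // candidate with the lexicographically larger index pair is kept,
         // so the selected minimum image is deterministic.)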
         for (int stride = 16; stride > 0; stride >>= 1) {
            if (paramID < kSize) { // only evaluate if on the k-grid
               if (imageID < stride) {
                  bool swap = false;
                  if (kSqVals_[threadIdx.x+stride] <
                                       (kSqVals_[threadIdx.x] - epsilon)) {
                     swap = true;
                  } else if (kSqVals_[threadIdx.x+stride] <
                                       (kSqVals_[threadIdx.x] + epsilon)) {
                     // kSq values effectively equal.
                     // Determine whether to swap based on hkl indices
                     for (int i = 0; i < 2; ++i) {
                        if (images_[2*threadIdx.x+i] >
                                       images_[2*(threadIdx.x+stride)+i]) {
                           break;
                        } else
                        if (images_[2*threadIdx.x+i] <
                                       images_[2*(threadIdx.x+stride)+i]) {
                           swap = true;
                           break;
                        }
                     }
                  }
                  if (swap) {
                     images_[2*threadIdx.x] =
                                       images_[2*(threadIdx.x+stride)];
                     images_[2*threadIdx.x+1] =
                                       images_[2*(threadIdx.x+stride)+1];
                     kSqVals_[threadIdx.x] = kSqVals_[threadIdx.x+stride];
                  }
               }
            }
            // (note: no __syncthreads() needed here because this reduction
            // occurs within 1 warp, so all threads are already synced)
         }

         // At this point, for any thread with imageID == 0, the corresponding
         // entries in kSqVals_ and images_ should contain the kSq value and
         // the minimum image, respectively. Store values and exit.
         if ((imageID == 0) && (paramID < kSize)) {
            kSq[paramID] = kSqVals_[threadIdx.x];
            minImages[paramID] = images_[2*threadIdx.x];
            minImages[paramID + kSize] = images_[2*threadIdx.x+1];
         }
      }

      /*
      * Compute minimum images of each wavevector, 3D explicit specialization.
      *
      * This kernel should be launched with >= 128*kSize threads.
      *
      * 128 threads are used to calculate one minimum image. In the CPU code,
      * we compare 125 different images of a wave to determine the minimum
      * image, and here we choose 128 instead because it is a power of 2,
      * allowing us to use a reduction algorithm to compare the images.
      *
      * When launched, this kernel requires dynamic memory allocation of
      * (3*sizeof(int) + sizeof(cudaReal)) * nThreadsPerBlock bytes.
      */
      template <>
      __global__ void _computeMinimumImages<3>(int* minImages, cudaReal* kSq,
                                               cudaReal const * kBasis,
                                               int const * meshDims,
                                               int const kSize)
      {
         unsigned int startID = blockIdx.x * blockDim.x + threadIdx.x;

         // Determine which wavevector and which image to evaluate
         unsigned int paramID = (startID >> 7); // equivalent to startID / 128
         unsigned int imageID = startID - (paramID * 128);

         // Determine the image that will be evaluated by this thread
         // (uses integer division, not ideal for speed)
         int s0, s1, s2;
         s0 = imageID / 25;
         s1 = (imageID - (s0 * 25)) / 5;
         s2 = imageID - (s0 * 25) - (s1 * 5);
         s0 -= 2; // shift values to go from -2 to 2
         s1 -= 2;
         s2 -= 2;
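         // Example: imageID = 87 gives s0 = 87/25 = 3, s1 = 12/5 = 2, and
         // s2 = 2, i.e., the image shifted by (+1, 0, 0) mesh periods after
         // the -2 offsets. Thread IDs 125-127 exceed the 125 distinct
         // images and give s0 = 3; these are valid (if distant) periodic
         // images of the same wave, so the reduction below remains correct.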

         // Set epsilon
         #ifdef SINGLE_PRECISION
         cudaReal epsilon = 1.0E-4;
         #else
         cudaReal epsilon = 1.0E-8;
         #endif

         // Declare dynamically allocated arrays in shared memory
         // (allocated space is shared between cudaReal array and int array)
         extern __shared__ cudaReal kSqVals_[];
         int* images_ = (int*)&kSqVals_[blockDim.x];

         if (paramID < kSize) { // only evaluate if on the k-grid

            // Load image data from global memory
            images_[3*threadIdx.x] = minImages[paramID] + (s0 * meshDims[0]);
            images_[3*threadIdx.x+1] = minImages[kSize + paramID] +
                                       (s1 * meshDims[1]);
            images_[3*threadIdx.x+2] = minImages[kSize + kSize + paramID] +
                                       (s2 * meshDims[2]);

            // Calculate kSq for this wave
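            // (The wavevector is k = sum_j G_j b_j, where G_j is the j-th
            // integer index of this image and b_j is the j-th reciprocal
            // basis vector, stored in kBasis[3*j] through kBasis[3*j+2];
            // kSq is then |k|^2.)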
            cudaReal kVec0(0.0), kVec1(0.0), kVec2(0.0);
            for (int k = 0; k < 3; ++k) {
               kVec0 += kBasis[3*k] * images_[3*threadIdx.x + k];
               kVec1 += kBasis[3*k+1] * images_[3*threadIdx.x + k];
               kVec2 += kBasis[3*k+2] * images_[3*threadIdx.x + k];
            }

            kSqVals_[threadIdx.x] = (kVec0*kVec0) + (kVec1*kVec1) +
                                    (kVec2*kVec2);
         }
         __syncthreads(); // wait for all threads to finish

         // Perform a parallel reduction on 128 threads to find min image
         // (note: stride >>= 1 is equivalent to stride /= 2)
         for (int stride = 64; stride > 0; stride >>= 1) {
            if (paramID < kSize) { // only evaluate if on the k-grid
               if (imageID < stride) {
                  bool swap = false;
                  if (kSqVals_[threadIdx.x+stride] <
                                       (kSqVals_[threadIdx.x] - epsilon)) {
                     swap = true;
                  } else if (kSqVals_[threadIdx.x+stride] <
                                       (kSqVals_[threadIdx.x] + epsilon)) {
                     // kSq values effectively equal.
                     // Determine whether to swap based on hkl indices
                     for (int i = 0; i < 3; ++i) {
                        if (images_[3*threadIdx.x+i] >
                                       images_[3*(threadIdx.x+stride)+i]) {
                           break;
                        } else
                        if (images_[3*threadIdx.x+i] <
                                       images_[3*(threadIdx.x+stride)+i]) {
                           swap = true;
                           break;
                        }
                     }
                  }
                  if (swap) {
                     images_[3*threadIdx.x] =
                                       images_[3*(threadIdx.x+stride)];
                     images_[3*threadIdx.x+1] =
                                       images_[3*(threadIdx.x+stride)+1];
                     images_[3*threadIdx.x+2] =
                                       images_[3*(threadIdx.x+stride)+2];
                     kSqVals_[threadIdx.x] = kSqVals_[threadIdx.x+stride];
                  }
               }
            }
            __syncthreads(); // wait for all threads to finish
         }

         // At this point, for any thread with imageID == 0, the corresponding
         // entries in kSqVals_ and images_ should contain the kSq value and
         // the minimum image, respectively. Store values and exit.
         if ((imageID == 0) && (paramID < kSize)) {
            kSq[paramID] = kSqVals_[threadIdx.x];
            minImages[paramID] = images_[3*threadIdx.x];
            minImages[paramID + kSize] = images_[3*threadIdx.x+1];
            minImages[paramID + kSize + kSize] = images_[3*threadIdx.x+2];
         }
      }

      /*
      * Compute the kSq array on the GPU.
      */
      template <int D>
      __global__ void _computeKSq(cudaReal* kSq, int const * waveBz,
                                  cudaReal const * kBasis,
                                  int const nParams, int const kSize)
      {
         int nThreads = blockDim.x * gridDim.x;
         int startID = blockIdx.x * blockDim.x + threadIdx.x;

         // Load kBasis into shared memory for fast access
         __shared__ cudaReal kBasis_[D*D];
         if (threadIdx.x < D * D) {
            kBasis_[threadIdx.x] = kBasis[threadIdx.x];
         }
         __syncthreads(); // wait for all threads to finish

         // Variables to be used in the loop
         int i, j, k, waveBz_;
         cudaReal kVec[D], kSqVal;
         // Note: local arrays are usually very slow in a CUDA kernel, but
         // the compiler should be able to unroll the loops below and store
         // the kVec array in registers, because the full structure of the
         // loops is known at compile time.

         // Loop through array
         for (i = startID; i < kSize; i += nThreads) {

            // initialize kVec to 0
            for (j = 0; j < D; ++j) {
               kVec[j] = 0.0;
            }

            // Calculate kVec
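            // (kVec accumulates the Cartesian wavevector k = sum_j G_j b_j,
            // where G_j = waveBz[i + j*kSize] and basis vector b_j occupies
            // kBasis_[D*j] through kBasis_[D*j + D - 1].)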
            for (j = 0; j < D; ++j) {
               waveBz_ = waveBz[i + (j * kSize)];
               for (k = 0; k < D; ++k) {
                  kVec[k] += kBasis_[k + (D*j)] * waveBz_;
               }
            }

            // Compute kSq
            kSqVal = 0.0;
            for (j = 0; j < D; ++j) {
               kSqVal += kVec[j] * kVec[j];
            }

            // Store value in global memory
            kSq[i] = kSqVal;

         } // kSize
      }

      /*
      * Compute the dKSq array on the GPU.
      */
      template <int D>
      __global__ void _computedKSq(cudaReal* dKSq, int const * waveBz,
                                   cudaReal const * dkkBasis,
                                   bool const * implicitInverse,
                                   int const nParams, int const kSize,
                                   bool isRealField)
      {
         // Size of dKSq is kSize * nParams
         // Each thread does nParams calculations
         int nThreads = blockDim.x * gridDim.x;
         int startID = blockIdx.x * blockDim.x + threadIdx.x;

         // Load dkkBasis into shared memory for fast access
         // (max size of dkkBasis is 54 elements, for a triclinic unit cell)
         extern __shared__ cudaReal dkkBasis_[];
         if (threadIdx.x < nParams * D * D) {
            dkkBasis_[threadIdx.x] = dkkBasis[threadIdx.x];
         }
         __syncthreads(); // wait for all threads to finish

         // Variables to be used in the loop
         int param, i, j, k, waveBz_[D];
         cudaReal dKSqVal;

         // Loop through array
         for (param = 0; param < nParams; ++param) {
            for (i = startID; i < kSize; i += nThreads) {

               // initialize to 0
               dKSqVal = 0.0;

               // Load waveBz to local memory
               for (j = 0; j < D; ++j) {
                  waveBz_[j] = waveBz[i + (j * kSize)];
               }

               // Compute dKSq
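               // d(kSq)/d(param) = sum_{j,k} G_j G_k dkkBasis(param, j, k),
               // where the G_j are the integer indices of wave i in the
               // first Brillouin zone and dkkBasis holds derivatives of the
               // dot products b_j . b_k of reciprocal basis vectors with
               // respect to this unit cell parameter.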
               for (j = 0; j < D; ++j) {
                  for (k = 0; k < D; ++k) {
                     dKSqVal += waveBz_[j] * waveBz_[k]
                                * dkkBasis_[k + (j * D) + (param * D * D)];
                  } // D
               } // D

               if (isRealField) {
                  if (implicitInverse[i]) {
                     // if element i's inverse is implicit
                     dKSqVal *= 2;
                  }
               }

               dKSq[(param * kSize) + i] = dKSqVal;

            } // kSize
         } // nParams
      }

   } // anonymous namespace

   /*
   * Constructor.
   */
   template <int D>
   WaveList<D>::WaveList(bool isRealField)
    : kSize_(0),
      isAllocated_(false),
      hasMinImages_(false),
      hasMinImages_h_(false),
      hasKSq_(false),
      hasdKSq_(false),
      unitCellPtr_(nullptr),
      meshPtr_(nullptr)
   {  isRealField_ = isRealField; }

   /*
   * Destructor.
   */
   template <int D>
   WaveList<D>::~WaveList()
   {}

   /*
   * Allocate memory, construct implicitInverse_.
   */
   template <int D>
   void WaveList<D>::allocate(Mesh<D> const & m, UnitCell<D> const & c)
   {
      UTIL_CHECK(m.size() > 0);
      UTIL_CHECK(c.nParameter() > 0);
      UTIL_CHECK(!isAllocated_);

      // Create permanent associations with mesh and unit cell
      unitCellPtr_ = &c;
      meshPtr_ = &m;

      int nParams = unitCell().nParameter();
      IntVec<D> const & meshDimensions = mesh().dimensions();

      // Compute kMeshDimensions_ and kSize_
      if (isRealField_) {
         FFT<D>::computeKMesh(meshDimensions, kMeshDimensions_, kSize_);
      } else {
         kMeshDimensions_ = meshDimensions;
         kSize_ = mesh().size();
      }
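      // (Example: for a real field on a 32 x 32 x 32 mesh, the usual
      // real-to-complex FFT convention stores only 32 x 32 x 17
      // wavevectors, so kSize_ = 17408; for a complex field the k-grid is
      // the full mesh and kSize_ = 32768.)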

      minImages_.allocate(kSize_ * D);
      kSq_.allocate(kMeshDimensions_);
      dKSq_.allocate(kSize_ * nParams);
      dKSqSlices_.allocate(nParams);
      for (int i = 0; i < nParams; i++) {
         dKSqSlices_[i].associate(dKSq_, i*kSize_, kMeshDimensions_);
      }

      // Allocate and set up implicitInverse_ array if isRealField_ == true
      // (only depends on mesh dimensions, only used for real fields)
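      // A wave whose last-dimension position is p has its inverse at
      // position (N - p) mod N, where N = mesh().dimension(D-1). The
      // inverse is "implicit" if that position lies outside the stored
      // k-grid, which happens for 0 < p < N/2; e.g., for N = 32, positions
      // 1-15 have implicit inverses, while 0 and 16 do not.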
      if (isRealField_) {
         implicitInverse_.allocate(kSize_);
         MeshIterator<D> kItr(kMeshDimensions_);
         HostDArray<bool> implicitInverse_h(kSize_);
         int inverseId;
         for (kItr.begin(); !kItr.atEnd(); ++kItr) {
            if (kItr.position(D-1) == 0) {
               inverseId = 0;
            } else {
               inverseId = mesh().dimension(D-1) - kItr.position(D-1);
            }
            // implicit if inverseId lies outside stored range [0, kDim-1]
            if (inverseId > kMeshDimensions_[D-1] - 1) {
               implicitInverse_h[kItr.rank()] = true;
            } else {
               implicitInverse_h[kItr.rank()] = false;
            }
         }
         implicitInverse_ = implicitInverse_h; // transfer to device memory
      }

      clearUnitCellData();
      isAllocated_ = true;
   }

   /*
   * Clear data that depends on unit cell parameters.
   */
   template <int D>
   void WaveList<D>::clearUnitCellData()
   {
      hasKSq_ = false;
      hasdKSq_ = false;
      if (hasVariableAngle<D>(unitCell().lattice())) {
         hasMinImages_ = false;
         hasMinImages_h_ = false;
      }
   }

   /*
   * Compute minimum image vectors and kSq.
   */
   template <int D>
   void WaveList<D>::computeMinimumImages()
   {
      // If min images are valid, return immediately
      if (hasMinImages_) return;

      // Preconditions
      UTIL_CHECK(isAllocated_);
      UTIL_CHECK(unitCell().lattice() != UnitCell<D>::Null);
      UTIL_CHECK(unitCell().isInitialized());
      UTIL_CHECK(minImages_.capacity() == kSize_ * D);

      // Set initial array of images to contain the k-grid points
      HostDArray<int> imagesTmp(D*kSize_);
      MeshIterator<D> kItr(kMeshDimensions_);
      for (int i = 0; i < D; i++) {
         for (kItr.begin(); !kItr.atEnd(); ++kItr) {
            imagesTmp[kItr.rank() + (i*kSize_)] = kItr.position(i);
         }
      }
      minImages_ = imagesTmp; // copy to device

      // Get kBasis and meshDims and store on device
      HostDArray<cudaReal> kBasis_h(D*D);
      HostDArray<int> meshDims_h(D);
      DeviceArray<cudaReal> kBasis(D*D);
      DeviceArray<int> meshDims(D);
      int idx = 0;
      for (int j = 0; j < D; ++j) {
         for (int k = 0; k < D; ++k) {
            kBasis_h[idx] = unitCell().kBasis(j)[k];
            idx++;
         }
         meshDims_h[j] = mesh().dimension(j);
      }
      kBasis = kBasis_h;
      meshDims = meshDims_h;

      // Set number of threads per grid point (depends on D)
      int threadsPerGP;
      if (D == 3) {
         threadsPerGP = 128;
      } else if (D == 2) {
         threadsPerGP = 32;
      } else { // D == 1
         threadsPerGP = 1;
      }
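      // (threadsPerGP is the smallest power of two that can cover the
      // number of candidate images compared per wave: 1 in 1D, 25 -> 32
      // in 2D, and 125 -> 128 in 3D, matching the reduction width assumed
      // by each kernel specialization.)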

      // GPU resources
      int nBlocks, nThreads;
      ThreadArray::setThreadsLogical(kSize_*threadsPerGP, nBlocks, nThreads);

      if ((D == 3) && (nThreads < 128)) {
         // Thread blocks too small. Manually set nThreads to 128
         ThreadArray::setThreadsPerBlock(128);
         ThreadArray::setThreadsLogical(kSize_*threadsPerGP, nBlocks,
                                        nThreads);

         // If the above was successful, print warning
         Log::file() << "Warning: "
                     << "nThreads too small for computeMinimumImages.\n"
                     << "Setting nThreads equal to 128." << std::endl;
      }

      // Launch kernel
      size_t sz = (D * sizeof(int) + sizeof(cudaReal)) * nThreads;
      _computeMinimumImages<D><<<nBlocks, nThreads, sz>>>
            (minImages_.cArray(), kSq_.cArray(), kBasis.cArray(),
             meshDims.cArray(), kSize_);

      hasMinImages_ = true;
      hasKSq_ = true;
   }

   /*
   * Compute values of k^2, using existing minImages if possible.
   */
   template <int D>
   void WaveList<D>::computeKSq()
   {
      // If kSq values are valid, return immediately without recomputing
      if (hasKSq_) return;

      // If necessary, compute minimum images
      if (!hasMinImages_) {
         computeMinimumImages(); // compute both min images and kSq
         return;
      }

      // If this point is reached, calculate kSq using the _computeKSq kernel

      // Preconditions
      UTIL_CHECK(unitCell().nParameter() > 0);
      UTIL_CHECK(unitCell().lattice() != UnitCell<D>::Null);
      UTIL_CHECK(unitCell().isInitialized());

      // Get kBasis and store on device
      HostDArray<cudaReal> kBasis_h(D*D);
      DeviceArray<cudaReal> kBasis(D*D);
      int idx = 0;
      for (int j = 0; j < D; ++j) {
         for (int k = 0; k < D; ++k) {
            kBasis_h[idx] = unitCell().kBasis(j)[k];
            idx++;
         }
      }
      kBasis = kBasis_h;

      // GPU resources
      int nBlocks, nThreads;
      ThreadArray::setThreadsLogical(kSize_, nBlocks, nThreads);

      // Launch kernel to calculate kSq on device
      _computeKSq<D><<<nBlocks, nThreads>>>
            (kSq_.cArray(), minImages_.cArray(), kBasis.cArray(),
             unitCell().nParameter(), kSize_);

      hasKSq_ = true;
   }

   /*
   * Compute derivatives of k^2 with respect to unit cell parameters.
   */
   template <int D>
   void WaveList<D>::computedKSq()
   {
      if (hasdKSq_) return; // dKSq already calculated

      // Compute minimum images if needed
      if (!hasMinImages_) {
         computeMinimumImages();
      }

      // Preconditions
      UTIL_CHECK(unitCell().nParameter() > 0);
      UTIL_CHECK(unitCell().lattice() != UnitCell<D>::Null);
      UTIL_CHECK(unitCell().isInitialized());

      // Calculate dkkBasis and store on device
      int idx;
      HostDArray<cudaReal> dkkBasis_h(unitCell().nParameter() * D * D);
      DeviceArray<cudaReal> dkkBasis;
      for (int i = 0; i < unitCell().nParameter(); ++i) {
         for (int j = 0; j < D; ++j) {
            for (int k = 0; k < D; ++k) {
               idx = k + (j * D) + (i * D * D);
               dkkBasis_h[idx] = unitCell().dkkBasis(i, j, k);
            }
         }
      }
      dkkBasis = dkkBasis_h;

      // GPU resources
      int nBlocks, nThreads;
      ThreadArray::setThreadsLogical(kSize_, nBlocks, nThreads);

      // Kernel requires block size to be >= the size of dkkBasis.
      // Max size of dkkBasis is 54, so this should always be satisfied.
      UTIL_CHECK(nThreads > dkkBasis.capacity());

      // Launch kernel to calculate dKSq on device
      size_t sz = sizeof(cudaReal)*dkkBasis.capacity();
      _computedKSq<D><<<nBlocks, nThreads, sz>>>
            (dKSq_.cArray(), minImages_.cArray(), dkkBasis.cArray(),
             implicitInverse_.cArray(), unitCell().nParameter(), kSize_,
             isRealField_);

      hasdKSq_ = true;
   }

   /*
   * Get minimum images on the host by reference.
   *
   * Gather data from device and re-arrange if necessary.
   */
   template <int D>
   HostDArray< IntVec<D> > const & WaveList<D>::minImages_h() const
   {
      UTIL_CHECK(hasMinImages_);
      if (!hasMinImages_h_) {

         // Copy minimum images from device to host
         HostDArray<int> minImages_temp;
         minImages_temp = minImages_;

         // Re-arrange: on the device, component j of wave i is stored at
         // index i + j*kSize_, while the host array stores one IntVec<D>
         // per wave
         if (minImages_h_.capacity() == 0) {
            minImages_h_.allocate(kSize_);
         }
         int i, j, k;
         for (j = 0; j < D; ++j) {
            k = kSize_*j;
            for (i = 0; i < kSize_; ++i) {
               minImages_h_[i][j] = minImages_temp[i + k];
            }
         }
         hasMinImages_h_ = true;
      }
      return minImages_h_;
   }
708
709} // Cuda
710} // Prdc
711} // Pscf
712#endif