PSCF v1.4.0
cuda/WaveList.tpp
1#ifndef PRDC_CUDA_WAVE_LIST_TPP
2#define PRDC_CUDA_WAVE_LIST_TPP
3
4/*
5* PSCF - Polymer Self-Consistent Field
6*
7* Copyright 2015 - 2025, The Regents of the University of Minnesota
8* Distributed under the terms of the GNU General Public License.
9*/
10
11#include "WaveList.h"
12
13#include <prdc/cuda/resources.h>
14#include <prdc/cuda/FFT.h>
15#include <prdc/crystal/UnitCell.h>
16#include <prdc/crystal/hasVariableAngle.h>
17#include <pscf/cuda/HostDArray.h>
18#include <pscf/mesh/Mesh.h>
19#include <pscf/mesh/MeshIterator.h>
20#include <pscf/math/Sort.h>
21#include <pscf/math/IntVec.h>
22
23namespace Pscf {
24namespace Prdc {
25namespace Cuda {
26
27 // CUDA kernels:
28 // (defined in an anonymous namespace, used only in this file)
29 namespace {
30
31 /*
32 * Compute minimum images of each wavevector, template declaration.
33 */
34 template <int D> __global__
35 void _computeMinimumImages(int* minImages,
36 cudaReal* kSq,
37 cudaReal const * kBasis,
38 int const * meshDims,
39 int const kSize);
40
41 /*
42 * Compute minimum images of each wavevector, 1D specialization.
43 *
44 * Kernel must be launched with kSize threads. (One thread calculates
45 * one minimum image.)
46 *
47 * When launched, this kernel requires no dynamic memory allocation.
48 */
49 template <> __global__
50 void _computeMinimumImages<1>(int* minImages,
51 cudaReal* kSq,
52 cudaReal const * kBasis,
53 int const * meshDims,
54 int const kSize)
55 {
56 unsigned int nThreads = blockDim.x * gridDim.x;
57 unsigned int startID = blockIdx.x * blockDim.x + threadIdx.x;
58
59 // Load kBasis and meshDims
60 cudaReal kBasis_ = kBasis[0];
61 int meshDims_ = meshDims[0];
62
63 for (int i = startID; i < kSize; i += nThreads) {
64 int img = minImages[i];
65
66 while (img > (meshDims_>>1)) { // note: x>>1 is same as x/2
67 img -= meshDims_;
68 }
69 while (img < -1*(meshDims_>>1)) {
70 img += meshDims_;
71 }
72
73 minImages[i] = img;
74 kSq[i] = img * img * kBasis_ * kBasis_;
75 }
76 }
77
78 /*
79 * Compute minimum images of each wavevector, 2D specialization.
80 *
81 * This kernel should be launched with >=32*kSize threads.
82 *
83 * 32 threads are used to calculate one minimum image. In the CPU code,
84 * we compare 25 different images of a wave to determine the minimum
85 * image, and here we choose 32 instead because it is a power of 2,
86 * allowing us to use a reduction algorithm to compare the images.
87 *
88 * When launched, this kernel requires dynamic memory allocation of
89 * (2*sizeof(int) + sizeof(cudaReal)) * nThreadsPerBlock.
90 */
91 template <> __global__
92 void _computeMinimumImages<2>(int* minImages,
93 cudaReal* kSq,
94 cudaReal const * kBasis,
95 int const * meshDims,
96 int const kSize)
97 {
98 unsigned int startID = blockIdx.x * blockDim.x + threadIdx.x;
99
100 // Determine which lattice parameter and which image to evaluate
101 unsigned int paramID = (startID >> 5); // equivalent to startID / 32
102 unsigned int imageID = startID - (paramID * 32); // value in [0, 31]
103
104 // Determine the image that will be evaluated by this thread
105 // (uses integer division, not ideal for speed)
106 int s0, s1;
107 s0 = imageID / 5;
108 s1 = imageID - (s0 * 5);
109 s0 -= 2; // shift values to go from -2 to 4, not 0 to 6
110 s1 -= 2; // shift values to go from -2 to 2, not 0 to 4
111
112 // Set epsilon
113 #ifdef SINGLE_PRECISION
114 cudaReal epsilon = 1.0E-4;
115 #else
116 cudaReal epsilon = 1.0E-8;
117 #endif
118
119 // Declare dynamically allocated arrays in shared memory
120 // (allocated space is shared between cudaReal array and int array)
121 extern __shared__ cudaReal kSqVals_[];
122 int* images_ = (int*)&kSqVals_[blockDim.x];
123
124 if (paramID < kSize) { // only evaluate if on the k-grid
125
126 // Load image from global memory and shift based on s0 and s1
127 images_[2*threadIdx.x] = minImages[paramID] + (s0*meshDims[0]);
128 images_[2*threadIdx.x+1] = minImages[kSize + paramID]
129 + (s1 * meshDims[1]);
130
131 // Calculate kSq for this wave
132 cudaReal kVec0 = (kBasis[0] * images_[2*threadIdx.x]) +
133 (kBasis[2] * images_[2*threadIdx.x+1]);
134 cudaReal kVec1 = (kBasis[1] * images_[2*threadIdx.x]) +
135 (kBasis[3] * images_[2*threadIdx.x+1]);
136
137 kSqVals_[threadIdx.x] = (kVec0*kVec0) + (kVec1*kVec1);
138 }
139 __syncthreads(); // wait for all threads to finish
140
141 // Perform a parallel reduction on 32 threads to find min image
142 // (note: stride >>= 1 is equivalent to stride /= 2)
143 for (int stride = 16; stride > 0; stride >>= 1) {
144 if (paramID < kSize) { // only evaluate if on the k-grid
145 if (imageID < stride) {
146 bool swap = false;
147 if (kSqVals_[threadIdx.x+stride] <
148 (kSqVals_[threadIdx.x] - epsilon)) {
149 swap = true;
150 } else if (kSqVals_[threadIdx.x+stride] <
151 (kSqVals_[threadIdx.x] + epsilon)) {
152 // kSq values effectively equal.
153 // Determine whether to swap based on hkl indices
154 for (int i = 0; i < 2; ++i) {
155 if (images_[2*threadIdx.x+i] >
156 images_[2*(threadIdx.x+stride)+i]) {
157 break;
158 } else
159 if (images_[2*threadIdx.x+i] <
160 images_[2*(threadIdx.x+stride)+i]) {
161 swap = true;
162 break;
163 }
164 }
165 }
166 if (swap) {
167 images_[2*threadIdx.x] =
168 images_[2*(threadIdx.x+stride)];
169 images_[2*threadIdx.x+1] =
170 images_[2*(threadIdx.x+stride)+1];
171 kSqVals_[threadIdx.x] = kSqVals_[threadIdx.x+stride];
172 }
173 }
174 }
175 // Note: no __syncthreads() needed here because this reduction
176 // occurswithin 1 warp, so all threads are already synced)
177 }
178
179 // At this point, for any thread with imageID == 0, the
180 // corresponding entries in kSqVals_ and images_ should contain
181 // the kSq value and the minimum image, respectively. Store
182 // values and exit
183 if ((imageID == 0) && (paramID < kSize)) {
184 kSq[paramID] = kSqVals_[threadIdx.x];
185 minImages[paramID] = images_[2*threadIdx.x];
186 minImages[paramID + kSize] = images_[2*threadIdx.x+1];
187 }
188 }
189
190 /*
191 * Compute minimum images of each wavevector, 3D specialization.
192 *
193 * This kernel should be launched with >=128*kSize threads.
194 *
195 * 128 threads are used to calculate one minimum image. In the CPU
196 * code, we compare 125 different images of a wave to determine the
197 * minimum image, and here we choose 128 instead because it is a
198 * power of 2, allowing us to use a reduction algorithm to compare
199 * the images.
200 *
201 * When launched, this kernel requires dynamic memory allocation of
202 * (3*sizeof(int) + sizeof(cudaReal)) * nThreadsPerBlock.
203 */
204 template <> __global__
205 void _computeMinimumImages<3>(int* minImages,
206 cudaReal* kSq,
207 cudaReal const * kBasis,
208 int const * meshDims,
209 int const kSize)
210 {
211 unsigned int startID = blockIdx.x * blockDim.x + threadIdx.x;
212
213 // Determine which lattice parameter and which image to evaluate
214 unsigned int paramID = (startID >> 7); // equivalent to startID / 128
215 unsigned int imageID = startID - (paramID * 128);
216
217 // Determine the image that will be evaluated by this thread
218 // (uses integer division, not ideal for speed)
219 int s0, s1, s2;
220 s0 = imageID / 25;
221 s1 = (imageID - (s0 * 25)) / 5;
222 s2 = imageID - (s0 * 25) - (s1 * 5);
223 s0 -= 2; // shift values to go from -2 to 2
224 s1 -= 2;
225 s2 -= 2;
226
227 // Set epsilon
228 #ifdef SINGLE_PRECISION
229 cudaReal epsilon = 1.0E-4;
230 #else
231 cudaReal epsilon = 1.0E-8;
232 #endif
233
234 // Declare dynamically allocated arrays in shared memory
235 // (allocated space is shared between cudaReal array and int array)
236 extern __shared__ cudaReal kSqVals_[];
237 int* images_ = (int*)&kSqVals_[blockDim.x];
238
239 if (paramID < kSize) { // only evaluate if on the k-grid
240
241 // Load image data from global memory
242 images_[3*threadIdx.x] = minImages[paramID] + (s0 * meshDims[0]);
243 images_[3*threadIdx.x+1] = minImages[kSize + paramID] +
244 (s1 * meshDims[1]);
245 images_[3*threadIdx.x+2] = minImages[kSize + kSize + paramID] +
246 (s2 * meshDims[2]);
247
248 // Calculate kSq for this wave
249 cudaReal kVec0(0.0), kVec1(0.0), kVec2(0.0);
250 for (int k = 0; k < 3; ++k) {
251 kVec0 += kBasis[3*k] * images_[3*threadIdx.x + k];
252 kVec1 += kBasis[3*k+1] * images_[3*threadIdx.x + k];
253 kVec2 += kBasis[3*k+2] * images_[3*threadIdx.x + k];
254 }
255
256 kSqVals_[threadIdx.x] = (kVec0*kVec0) + (kVec1*kVec1) +
257 (kVec2*kVec2);
258 }
259 __syncthreads(); // wait for all threads to finish
260
261 // Perform a parallel reduction on 128 threads to find min image
262 // (note: stride >>= 1 is equivalent to stride /= 2)
263 for (int stride = 64; stride > 0; stride >>= 1) {
264 if (paramID < kSize) { // only evaluate if on the k-grid
265 if (imageID < stride) {
266 bool swap = false;
267 if (kSqVals_[threadIdx.x+stride] <
268 (kSqVals_[threadIdx.x] - epsilon)) {
269 swap = true;
270 } else if (kSqVals_[threadIdx.x+stride] <
271 (kSqVals_[threadIdx.x] + epsilon)) {
272 // kSq values effectively equal.
273 // Determine whether to swap based on hkl indices
274 for (int i = 0; i < 3; ++i) {
275 if (images_[3*threadIdx.x+i] >
276 images_[3*(threadIdx.x+stride)+i]) {
277 break;
278 } else
279 if (images_[3*threadIdx.x+i] <
280 images_[3*(threadIdx.x+stride)+i]) {
281 swap = true;
282 break;
283 }
284 }
285 }
286 if (swap) {
287 images_[3*threadIdx.x] =
288 images_[3*(threadIdx.x+stride)];
289 images_[3*threadIdx.x+1] =
290 images_[3*(threadIdx.x+stride)+1];
291 images_[3*threadIdx.x+2] =
292 images_[3*(threadIdx.x+stride)+2];
293 kSqVals_[threadIdx.x] = kSqVals_[threadIdx.x+stride];
294 }
295 }
296 }
297 __syncthreads(); // wait for all threads to finish
298 }
299
300 // At this point, for any thread with imageID == 0, the
301 // corresponding entries in kSqVals_ and images_ should
302 // contain the kSq value and the minimum image, respectively.
303 // Store values and exit
304 if ((imageID == 0) && (paramID < kSize)) {
305 kSq[paramID] = kSqVals_[threadIdx.x];
306 minImages[paramID] = images_[3*threadIdx.x];
307 minImages[paramID + kSize] = images_[3*threadIdx.x+1];
308 minImages[paramID + kSize + kSize] = images_[3*threadIdx.x+2];
309 }
310 }
311
312 /*
313 * Compute the kSq array on the GPU.
314 */
315 template <int D> __global__
316 void _computeKSq(cudaReal* kSq,
317 int const * waveBz,
318 cudaReal const * kBasis,
319 int const nParams,
320 int const kSize)
321 {
322 int nThreads = blockDim.x * gridDim.x;
323 int startID = blockIdx.x * blockDim.x + threadIdx.x;
324
325 // Load kBasis into shared memory for fast access
326 __shared__ cudaReal kBasis_[D*D];
327 if (threadIdx.x < D * D) {
328 kBasis_[threadIdx.x] = kBasis[threadIdx.x];
329 }
330 __syncthreads(); // wait for all threads to finish
331
332 // Variables to be used in the loop
333 int i, j, k, waveBz_;
334 cudaReal kVec[D], kSqVal;
335 // Note: usually local arrays are very slow in a CUDA kernel, but
336 // the compiler should ideally be able to unroll the loops below
337 // and store the kVec array in the register, because the full
338 // structure of the loops below is known at compile time.
339
340 // Loop through array
341 for (i = startID; i < kSize; i += nThreads) {
342
343 // initialize kVec to 0
344 for (j = 0; j < D; ++j) {
345 kVec[j] = 0.0;
346 }
347
348 // Calculate kVec
349 for (j = 0; j < D; ++j) {
350 waveBz_ = waveBz[i + (j * kSize)];
351 for (k = 0; k < D; ++k) {
352 kVec[k] += kBasis_[k + (D*j)] * waveBz_;
353 }
354 }
355
356 // Compute kSq
357 kSqVal = 0.0;
358 for (j = 0; j < D; ++j) {
359 kSqVal += kVec[j] * kVec[j];
360 }
361
362 // Store value in global memory
363 kSq[i] = kSqVal;
364
365 } // kSize
366 }
367
368 /*
369 * Compute the dKSq array on the GPU.
370 *
371 * The implicitInverse array is only used if isRealField is true.
372 * If isRealField is false, implicitInverse may be a null pointer.
373 */
374 template <int D> __global__
375 void _computedKSq(cudaReal* dKSq,
376 int const * waveBz,
377 cudaReal const * dkkBasis,
378 bool const * implicitInverse,
379 int const nParams,
380 int const kSize,
381 bool isRealField)
382 {
383 // Size of dKSq is kSize * nParams
384 // Each thread does nParams calculations
385 int nThreads = blockDim.x * gridDim.x;
386 int startID = blockIdx.x * blockDim.x + threadIdx.x;
387
388 // Load dkkBasis into shared memory for fast access
389 // (max size of dkkBasis is 54 elements for triclinic unit cell)
390 extern __shared__ cudaReal dkkBasis_[];
391 if (threadIdx.x < nParams * D * D) {
392 dkkBasis_[threadIdx.x] = dkkBasis[threadIdx.x];
393 }
394 __syncthreads(); // wait for all threads to finish
395
396 // Variables to be used in the loop
397 int param, i, j, k, waveBz_[D];
398 cudaReal dKSqVal;
399
400 // Loop through array
401 for (param = 0; param < nParams; ++param) {
402 for (i = startID; i < kSize; i += nThreads) {
403
404 // initialize to 0
405 dKSqVal = 0.0;
406
407 // Load waveBz to local memory
408 for (j = 0; j < D; ++j) {
409 waveBz_[j] = waveBz[i + (j * kSize)];
410 }
411
412 // Compute dKSq
413 for (j = 0; j < D; ++j) {
414 for (k = 0; k < D; ++k) {
415 dKSqVal += waveBz_[j] * waveBz_[k]
416 * dkkBasis_[k + (j * D) + (param * D * D)];
417 } // D
418 } // D
419
420 if (isRealField) {
421 if (implicitInverse[i]) {
422 // if element i's inverse is implicit
423 dKSqVal *= 2;
424 }
425 }
426
427 dKSq[(param * kSize) + i] = dKSqVal;
428
429 } // kSize
430 } // nParams
431 }
432
433 } // anonymous namespace Pscf::Prdc::(unnamed)
434
435 // Member function definitions
436
437 /*
438 * Constructor.
439 */
440 template <int D>
442 : kSize_(0),
443 nBunch_(0),
444 isAllocated_(false),
445 hasMinImages_(false),
446 hasMinImages_h_(false),
447 hasKSq_(false),
448 hasdKSq_(false),
449 isSorted_(false),
450 isRealField_(isRealField),
451 unitCellPtr_(nullptr),
452 meshPtr_(nullptr)
453 {}
454
455 /*
456 * Destructor.
457 */
458 template <int D>
460 {
461
462 if (dKSqSlices_.isAllocated()) {
463 int nParams = dKSqSlices_.capacity();
464 for (int i = 0; i < nParams; i++) {
465 if (dKSqSlices_[i].isAssociated()) {
466 dKSqSlices_[i].dissociate();
467 }
468 }
469 }
470
471 /*
472 * The above loop dissociates elements of dKSqSlices_ from memory
473 * owned by dKSq_. Because the destructor body is called before
474 * destructors for members, this operation will occur before either
475 * container is destroyed. Strictly speaking, this is a redundant
476 * safety measure as long as dKSqSlices_ appears in the declaration
477 * of members after dKSq_, because this order causes dKSqSlices_
478 * to to be destroyed before dKSq_, and the destructor for each
479 * element of dKSqSlices_ would also call dissociate() if the
480 * association still existed at that point.
481 */
482
483 }
484
485 /*
486 * Allocate memory.
487 *
488 * This function allocates and construct implicitInverse_ if and
489 * only if isRealField_ == true.
490 */
491 template <int D>
492 void WaveList<D>::allocate(Mesh<D> const & m, UnitCell<D> const & c)
493 {
494 UTIL_CHECK(m.size() > 0);
495 UTIL_CHECK(c.nParameter() > 0);
496 UTIL_CHECK(!isAllocated_);
497
498 // Create permanent associations with mesh and unit cell
499 unitCellPtr_ = &c;
500 meshPtr_ = &m;
501
502 int nParams = unitCell().nParameter();
503 IntVec<D> const & meshDimensions = mesh().dimensions();
504
505 // Compute kMeshDimensions_ and kSize_
506 if (isRealField_) {
507 FFT<D>::computeKMesh(meshDimensions, kMeshDimensions_, kSize_);
508 } else {
509 kMeshDimensions_ = meshDimensions;
510 kSize_ = mesh().size();
511 }
512 UTIL_CHECK(kSize_ > 0);
513
514 minImages_.allocate(kSize_ * D);
515 kSq_.allocate(kMeshDimensions_);
516 dKSq_.allocate(kSize_ * nParams);
517 dKSqSlices_.allocate(nParams);
518 for (int i = 0; i < nParams; i++) {
519 dKSqSlices_[i].associate(dKSq_, i*kSize_, kMeshDimensions_);
520 }
521
522 // Allocate and set up implicitInverse_ array if isRealField_ == true
523 // (only depends on mesh dimensions, only used for real fields)
524 if (isRealField_) {
525 implicitInverse_.allocate(kSize_);
526 MeshIterator<D> kItr(kMeshDimensions_);
527 HostDArray<bool> implicitInverse_h(kSize_);
528 int inverseId;
529 for (kItr.begin(); !kItr.atEnd(); ++kItr) {
530 if (kItr.position(D-1) == 0) {
531 inverseId = 0;
532 } else {
533 inverseId = mesh().dimension(D-1) - kItr.position(D-1);
534 }
535 if (inverseId >= kMeshDimensions_[D-1]) {
536 implicitInverse_h[kItr.rank()] = true;
537 } else {
538 implicitInverse_h[kItr.rank()] = false;
539 }
540 }
541 implicitInverse_ = implicitInverse_h; // transfer to device memory
542 }
543
545 isAllocated_ = true;
546 }
547
548 /*
549 * Clear data that depends on unit cell parameters.
550 */
551 template <int D>
553 {
554 hasKSq_ = false;
555 hasdKSq_ = false;
556 if (unitCell().nParameter() > 1) {
557 isSorted_ = false;
558 sortedBunches_.clear();
559 nBunch_ = 0;
560 }
561 if (hasVariableAngle<D>(unitCell().lattice())) {
562 hasMinImages_ = false;
563 hasMinImages_h_ = false;
564 }
565 }
566
567 /*
568 * Compute minimum image vectors and kSq.
569 */
570 template <int D>
572 {
573 // If min images are valid, return immediately
574 if (hasMinImages_) return;
575
576 // Precondition
577 UTIL_CHECK(isAllocated_);
578 UTIL_CHECK(unitCell().lattice() != UnitCell<D>::Null);
579 UTIL_CHECK(unitCell().isInitialized());
580 UTIL_CHECK(minImages_.capacity() == kSize_ * D);
581
582 // Set initial array of images to contain the k-grid points
583 HostDArray<int> imagesTmp(D*kSize_);
584 MeshIterator<D> kItr(kMeshDimensions_);
585 for (int i = 0; i < D; i++) {
586 for (kItr.begin(); !kItr.atEnd(); ++kItr) {
587 imagesTmp[kItr.rank() + (i*kSize_)] = kItr.position(i);
588 }
589 }
590 minImages_ = imagesTmp; // copy to device
591
592 // Get kBasis and meshDims and store on device
593 HostDArray<cudaReal> kBasis_h(D*D);
594 HostDArray<int> meshDims_h(D);
595 DeviceArray<cudaReal> kBasis(D*D);
596 DeviceArray<int> meshDims(D);
597 int idx = 0;
598 for (int j = 0; j < D; ++j) {
599 for (int k = 0; k < D; ++k) {
600 kBasis_h[idx] = unitCell().kBasis(j)[k];
601 idx++;
602 }
603 meshDims_h[j] = mesh().dimension(j);
604 }
605 kBasis = kBasis_h;
606 meshDims = meshDims_h;
607
608 // Set number of threads per gridpoint (depends on D)
609 int threadsPerGP;
610 if (D == 3) {
611 threadsPerGP = 128;
612 } else if (D == 2) {
613 threadsPerGP = 32;
614 } else if (D == 1) {
615 threadsPerGP = 1;
616 }
617
618 // GPU resources
619 int nBlocks, nThreads;
620 ThreadArray::setThreadsLogical(kSize_*threadsPerGP, nBlocks, nThreads);
621
622 if ((D == 3) && (nThreads < 128)) {
623 // Thread blocks too small. Manually set nThreads to 128
625 ThreadArray::setThreadsLogical(kSize_*threadsPerGP, nBlocks, nThreads);
626
627 // If the above was successful, print warning
628 Log::file() << "Warning: "
629 << "nThreads too small for computeMinimumImages.\n"
630 << "Setting nThreads equal to 128." << std::endl;
631 }
632
633 UTIL_CHECK(kSq_.isAllocated());
634 UTIL_CHECK(minImages_.isAllocated());
635
636 // Launch kernel
637 size_t sz = (D * sizeof(int) + sizeof(cudaReal)) * nThreads;
638 _computeMinimumImages<D><<<nBlocks, nThreads, sz>>>
639 (minImages_.cArray(), kSq_.cArray(), kBasis.cArray(),
640 meshDims.cArray(), kSize_);
641
642 hasMinImages_ = true;
643 hasKSq_ = true;
644 }
645
646 /*
647 * Compute values of k^2, using existing minImages if possible.
648 */
649 template <int D>
651 {
652 // If kSq values are valid, return immediately without recomputing
653 if (hasKSq_) return;
654
655 // If necessary, compute minimum images
656 if (!hasMinImages_) {
657 computeMinimumImages(); // compute both min images and kSq
658 return;
659 }
660
661 // If this point is reached, calculate kSq using _computeKSq kernel
662
663 // Preconditions
664 UTIL_CHECK(unitCell().nParameter() > 0);
665 UTIL_CHECK(unitCell().lattice() != UnitCell<D>::Null);
666 UTIL_CHECK(unitCell().isInitialized());
667 UTIL_CHECK(isAllocated_);
668
669 // Get kBasis and store on device
670 HostDArray<cudaReal> kBasis_h(D*D);
671 DeviceArray<cudaReal> kBasis(D*D);
672 int idx = 0;
673 for (int j = 0; j < D; ++j) {
674 for (int k = 0; k < D; ++k) {
675 kBasis_h[idx] = unitCell().kBasis(j)[k];
676 idx++;
677 }
678 }
679 kBasis = kBasis_h;
680
681 // GPU resources
682 int nBlocks, nThreads;
683 ThreadArray::setThreadsLogical(kSize_, nBlocks, nThreads);
684
685 // Launch kernel to calculate kSq on device
686 UTIL_CHECK(kSq_.isAllocated());
687 UTIL_CHECK(minImages_.isAllocated());
688 _computeKSq<D><<<nBlocks, nThreads>>>
689 (kSq_.cArray(), minImages_.cArray(), kBasis.cArray(),
690 unitCell().nParameter(), kSize_);
691
692 hasKSq_ = true;
693 }
694
695 /*
696 * Compute derivatives of k^2 with respect to unit cell parameters.
697 */
698 template <int D>
700 {
701 if (hasdKSq_) return; // dKSq already calculated
702
703 // Compute minimum images if needed
704 if (!hasMinImages_) {
706 }
707
708 // Preconditions
709 UTIL_CHECK(unitCell().nParameter() > 0);
710 UTIL_CHECK(unitCell().lattice() != UnitCell<D>::Null);
711 UTIL_CHECK(unitCell().isInitialized());
712 UTIL_CHECK(isAllocated_);
713
714 // Calculate dkkBasis and store on device
715 int idx;
716 HostDArray<cudaReal> dkkBasis_h(unitCell().nParameter() * D * D);
717 DeviceArray<cudaReal> dkkBasis;
718 for (int i = 0 ; i < unitCell().nParameter(); ++i) {
719 for (int j = 0; j < D; ++j) {
720 for (int k = 0; k < D; ++k) {
721 idx = k + (j * D) + (i * D * D);
722 dkkBasis_h[idx] = unitCell().dkkBasis(i, j, k);
723 }
724 }
725 }
726 dkkBasis = dkkBasis_h;
727
728 // GPU resources
729 int nBlocks, nThreads;
730 ThreadArray::setThreadsLogical(kSize_, nBlocks, nThreads);
731
732 // Kernel requires block size to be >= the size of dkkBasis.
733 // Max size of dkkBasis is 54, so this should always be satisfied
734 UTIL_CHECK(nThreads > dkkBasis.capacity());
735
736 // Preconditions for kernel
737 UTIL_CHECK(dKSq_.isAllocated());
738 UTIL_CHECK(minImages_.isAllocated());
739 bool const * implicitInversePtr = nullptr;
740 if (isRealField_) {
741 UTIL_CHECK(implicitInverse_.isAllocated());
742 implicitInversePtr = implicitInverse_.cArray();
743 }
744
745 // Launch kernel to calculate dKSq on device
746 size_t sz = sizeof(cudaReal)*dkkBasis.capacity();
747 _computedKSq<D><<<nBlocks, nThreads, sz>>>
748 (dKSq_.cArray(), minImages_.cArray(), dkkBasis.cArray(),
749 implicitInversePtr, unitCell().nParameter(), kSize_,
750 isRealField_);
751
752 hasdKSq_ = true;
753 }
754
755 /*
756 * Sort waves by magnitude.
757 */
758 template <int D>
760 {
761 // If waves are already sorted, return immediately
762 if (isSorted_) return;
763
764 // Compute wavenumbers if necessary
765 UTIL_CHECK(isAllocated_);
766 if (!hasKSq_) {
767 computeKSq();
768 }
769
770 // Copy values of kSq to host
771 HostDArray<cudaReal> kSq_h = kSq_;
772
773 // Construct Sort::Item objects with value = kSq, id = wave id
774 std::vector< Sort::Item<double> > items;
776 for (int i = 0; i < kSize_; ++i) {
777 item.value = kSq_h[i];
778 item.id = i;
779 items.push_back(item);
780 }
781 UTIL_CHECK((int)items.size() == kSize_);
782
783 // Sort array of items
784 Sort::sort(items);
785
786 // Fill sortedIds_ member array with sorted ids
787 if (!sortedIds_.isAllocated()) {
788 sortedIds_.allocate(kSize_);
789 }
790 UTIL_CHECK(sortedIds_.capacity() == kSize_);
791 for (int i = 0; i < kSize_; ++i) {
792 sortedIds_[i] = items[i].id;
793 }
794
795 // Construct sortedBunches_ array and set nBunch_
796 double epsilon = 1.0E-8;
797 sortedBunches_.clear();
798 Sort::findBunches(items, sortedBunches_, epsilon);
799 nBunch_ = sortedBunches_.size();
800
801 // Construct bunchIds_ array
802 UTIL_CHECK(nBunch_ > 0);
803 if (!bunchIds_.isAllocated()) {
804 bunchIds_.allocate(kSize_);
805 }
806 UTIL_CHECK(bunchIds_.capacity() == kSize_);
807 int begin, end, ib, iw;
808 for (ib = 0; ib < nBunch_; ++ib) {
809 begin = sortedBunches_[ib][0];
810 end = sortedBunches_[ib][1];
811 UTIL_CHECK(end > begin);
812 for (iw = begin; iw < end; ++iw) {
813 bunchIds_[sortedIds_[iw]] = ib;
814 }
815 }
816 UTIL_CHECK(end == kSize_);
817
818 isSorted_ = true;
819 }
820
821 /*
822 * Get minimum images on the host by reference.
823 *
824 * Gather data from device and re-arrange if necessary.
825 */
826 template <int D>
828 {
829 UTIL_CHECK(hasMinImages_);
830 if (!hasMinImages_h_) {
831 HostDArray<int> minImages_temp;
832 minImages_temp = minImages_;
833 int i, j, k;
834 for (j = 0; j < D; ++j) {
835 k = D*j;
836 for (i = 0; i < kSize_; ++i) {
837 minImages_h_[i][j] = minImages_temp[i + k];
838 }
839 }
840 hasMinImages_h_ = true;
841 }
842 return minImages_h_;
843 }
844
845} // Cuda
846} // Prdc
847} // Pscf
848#endif
Dynamic array on the GPU device with aligned data.
Definition DeviceArray.h:96
int capacity() const
Return array capacity.
Data * cArray()
Return pointer to underlying C array.
Template for dynamic array stored in host CPU memory.
Definition HostDArray.h:41
An IntVec<D, T> is a D-component vector of elements of integer type T.
Definition IntVec.h:27
Iterator over points in a Mesh<D>.
int rank() const
Get the rank of current element.
void begin()
Set iterator to the first point in the mesh.
bool atEnd() const
Is this the end (i.e., one past the last point)?
IntVec< D > position() const
Get current position in the grid, as integer vector.
Description of a regular grid of points in a periodic domain.
Definition Mesh.h:61
int size() const
Get total number of grid points.
Definition Mesh.h:229
static void computeKMesh(IntVec< D > const &rMeshDimensions, IntVec< D > &kMeshDimensions, int &kSize)
Compute dimensions and size of k-space mesh for DFT of real data.
Definition cuda/FFT.tpp:278
void sortWaves()
Sort waves in order of ascending wavevector norm.
void clearUnitCellData()
Clear all internal data that depends on lattice parameters.
void computedKSq()
Compute derivatives of |k|^2 w/ respect to unit cell parameters.
WaveList(bool isRealField=true)
Constructor.
void computeMinimumImages()
Compute minimum images of wavevectors, and also calculate kSq.
void allocate(Mesh< D > const &m, UnitCell< D > const &c)
Allocate memory and set association with a Mesh and UnitCell object.
void computeKSq()
Compute square norm |k|^2 for all wavevectors.
bool isRealField() const
Is this WaveList set up for use with real-valued fields?
HostDArray< IntVec< D > > const & minImages_h() const
Get minimum images as IntVec<D> objects on the host.
int nParameter() const
Get the number of parameters in the unit cell.
Base template for UnitCell<D> classes, D=1, 2 or 3.
Definition UnitCell.h:56
static std::ostream & file()
Get log ostream by reference.
Definition Log.cpp:59
#define UTIL_CHECK(condition)
Assertion macro suitable for serial or parallel production code.
Definition global.h:68
void setThreadsLogical(int nThreadsLogical)
Given total number of threads, set 1D execution configuration.
int nThreads()
Get the number of threads per block for execution.
void setThreadsPerBlock()
Set the number of threads per block to a default value.
dim3 meshDims()
Return last requested multidimensional grid of threads.
void findBunches(std::vector< Item< T > > const &items, GArray< Bunch > &bunches, T epsilon)
Identify "bunches" of equal values within a sorted vector.
Definition Sort.cpp:29
void sort(std::vector< Item< T > > &items)
Sort a std::vector< Item<T> > by ascending item value.
Definition Sort.cpp:22
Fields, FFTs, and utilities for periodic boundary conditions (CUDA).
Definition CField.cu:12
Periodic fields and crystallography.
Definition complex.cpp:11
bool hasVariableAngle(typename UnitCell< D >::LatticeSystem lattice)
Return true if lattice type has variable angle parameters.
PSCF package top-level namespace.
cufftDoubleReal cudaReal
Real number type used in CPU code that uses FFTW.
Definition cudaTypes.h:35
Struct with value and index, to keep track of permutation.
Definition Sort.h:37