1#ifndef PRDC_CUDA_WAVE_LIST_TPP
2#define PRDC_CUDA_WAVE_LIST_TPP
13#include <prdc/cuda/resources.h>
14#include <prdc/cuda/FFT.h>
15#include <prdc/crystal/UnitCell.h>
16#include <prdc/crystal/hasVariableAngle.h>
17#include <pscf/cuda/HostDArray.h>
18#include <pscf/mesh/Mesh.h>
19#include <pscf/mesh/MeshIterator.h>
20#include <pscf/math/Sort.h>
21#include <pscf/math/IntVec.h>
34 template <
int D> __global__
35 void _computeMinimumImages(
int* minImages,
49 template <> __global__
50 void _computeMinimumImages<1>(
int* minImages,
56 unsigned int nThreads = blockDim.x * gridDim.x;
57 unsigned int startID = blockIdx.x * blockDim.x + threadIdx.x;
63 for (
int i = startID; i < kSize; i +=
nThreads) {
64 int img = minImages[i];
66 while (img > (meshDims_>>1)) {
69 while (img < -1*(meshDims_>>1)) {
74 kSq[i] = img * img * kBasis_ * kBasis_;
91 template <> __global__
92 void _computeMinimumImages<2>(
int* minImages,
98 unsigned int startID = blockIdx.x * blockDim.x + threadIdx.x;
101 unsigned int paramID = (startID >> 5);
102 unsigned int imageID = startID - (paramID * 32);
108 s1 = imageID - (s0 * 5);
113 #ifdef SINGLE_PRECISION
121 extern __shared__
cudaReal kSqVals_[];
122 int* images_ = (
int*)&kSqVals_[blockDim.x];
124 if (paramID < kSize) {
127 images_[2*threadIdx.x] = minImages[paramID] + (s0*
meshDims[0]);
128 images_[2*threadIdx.x+1] = minImages[kSize + paramID]
132 cudaReal kVec0 = (kBasis[0] * images_[2*threadIdx.x]) +
133 (kBasis[2] * images_[2*threadIdx.x+1]);
134 cudaReal kVec1 = (kBasis[1] * images_[2*threadIdx.x]) +
135 (kBasis[3] * images_[2*threadIdx.x+1]);
137 kSqVals_[threadIdx.x] = (kVec0*kVec0) + (kVec1*kVec1);
143 for (
int stride = 16; stride > 0; stride >>= 1) {
144 if (paramID < kSize) {
145 if (imageID < stride) {
147 if (kSqVals_[threadIdx.x+stride] <
148 (kSqVals_[threadIdx.x] - epsilon)) {
150 }
else if (kSqVals_[threadIdx.x+stride] <
151 (kSqVals_[threadIdx.x] + epsilon)) {
154 for (
int i = 0; i < 2; ++i) {
155 if (images_[2*threadIdx.x+i] >
156 images_[2*(threadIdx.x+stride)+i]) {
159 if (images_[2*threadIdx.x+i] <
160 images_[2*(threadIdx.x+stride)+i]) {
167 images_[2*threadIdx.x] =
168 images_[2*(threadIdx.x+stride)];
169 images_[2*threadIdx.x+1] =
170 images_[2*(threadIdx.x+stride)+1];
171 kSqVals_[threadIdx.x] = kSqVals_[threadIdx.x+stride];
183 if ((imageID == 0) && (paramID < kSize)) {
184 kSq[paramID] = kSqVals_[threadIdx.x];
185 minImages[paramID] = images_[2*threadIdx.x];
186 minImages[paramID + kSize] = images_[2*threadIdx.x+1];
204 template <> __global__
205 void _computeMinimumImages<3>(
int* minImages,
208 int const * meshDims,
211 unsigned int startID = blockIdx.x * blockDim.x + threadIdx.x;
214 unsigned int paramID = (startID >> 7);
215 unsigned int imageID = startID - (paramID * 128);
221 s1 = (imageID - (s0 * 25)) / 5;
222 s2 = imageID - (s0 * 25) - (s1 * 5);
228 #ifdef SINGLE_PRECISION
236 extern __shared__
cudaReal kSqVals_[];
237 int* images_ = (
int*)&kSqVals_[blockDim.x];
239 if (paramID < kSize) {
242 images_[3*threadIdx.x] = minImages[paramID] + (s0 *
meshDims[0]);
243 images_[3*threadIdx.x+1] = minImages[kSize + paramID] +
245 images_[3*threadIdx.x+2] = minImages[kSize + kSize + paramID] +
249 cudaReal kVec0(0.0), kVec1(0.0), kVec2(0.0);
250 for (
int k = 0; k < 3; ++k) {
251 kVec0 += kBasis[3*k] * images_[3*threadIdx.x + k];
252 kVec1 += kBasis[3*k+1] * images_[3*threadIdx.x + k];
253 kVec2 += kBasis[3*k+2] * images_[3*threadIdx.x + k];
256 kSqVals_[threadIdx.x] = (kVec0*kVec0) + (kVec1*kVec1) +
263 for (
int stride = 64; stride > 0; stride >>= 1) {
264 if (paramID < kSize) {
265 if (imageID < stride) {
267 if (kSqVals_[threadIdx.x+stride] <
268 (kSqVals_[threadIdx.x] - epsilon)) {
270 }
else if (kSqVals_[threadIdx.x+stride] <
271 (kSqVals_[threadIdx.x] + epsilon)) {
274 for (
int i = 0; i < 3; ++i) {
275 if (images_[3*threadIdx.x+i] >
276 images_[3*(threadIdx.x+stride)+i]) {
279 if (images_[3*threadIdx.x+i] <
280 images_[3*(threadIdx.x+stride)+i]) {
287 images_[3*threadIdx.x] =
288 images_[3*(threadIdx.x+stride)];
289 images_[3*threadIdx.x+1] =
290 images_[3*(threadIdx.x+stride)+1];
291 images_[3*threadIdx.x+2] =
292 images_[3*(threadIdx.x+stride)+2];
293 kSqVals_[threadIdx.x] = kSqVals_[threadIdx.x+stride];
304 if ((imageID == 0) && (paramID < kSize)) {
305 kSq[paramID] = kSqVals_[threadIdx.x];
306 minImages[paramID] = images_[3*threadIdx.x];
307 minImages[paramID + kSize] = images_[3*threadIdx.x+1];
308 minImages[paramID + kSize + kSize] = images_[3*threadIdx.x+2];
315 template <
int D> __global__
322 int nThreads = blockDim.x * gridDim.x;
323 int startID = blockIdx.x * blockDim.x + threadIdx.x;
327 if (threadIdx.x < D * D) {
328 kBasis_[threadIdx.x] = kBasis[threadIdx.x];
333 int i, j, k, waveBz_;
341 for (i = startID; i < kSize; i +=
nThreads) {
344 for (j = 0; j < D; ++j) {
349 for (j = 0; j < D; ++j) {
350 waveBz_ = waveBz[i + (j * kSize)];
351 for (k = 0; k < D; ++k) {
352 kVec[k] += kBasis_[k + (D*j)] * waveBz_;
358 for (j = 0; j < D; ++j) {
359 kSqVal += kVec[j] * kVec[j];
374 template <
int D> __global__
378 bool const * implicitInverse,
385 int nThreads = blockDim.x * gridDim.x;
386 int startID = blockIdx.x * blockDim.x + threadIdx.x;
390 extern __shared__
cudaReal dkkBasis_[];
391 if (threadIdx.x < nParams * D * D) {
392 dkkBasis_[threadIdx.x] = dkkBasis[threadIdx.x];
397 int param, i, j, k, waveBz_[D];
401 for (param = 0; param < nParams; ++param) {
402 for (i = startID; i < kSize; i +=
nThreads) {
408 for (j = 0; j < D; ++j) {
409 waveBz_[j] = waveBz[i + (j * kSize)];
413 for (j = 0; j < D; ++j) {
414 for (k = 0; k < D; ++k) {
415 dKSqVal += waveBz_[j] * waveBz_[k]
416 * dkkBasis_[k + (j * D) + (param * D * D)];
421 if (implicitInverse[i]) {
427 dKSq[(param * kSize) + i] = dKSqVal;
445 hasMinImages_(false),
446 hasMinImages_h_(false),
451 unitCellPtr_(nullptr),
462 if (dKSqSlices_.isAllocated()) {
463 int nParams = dKSqSlices_.capacity();
464 for (
int i = 0; i < nParams; i++) {
465 if (dKSqSlices_[i].isAssociated()) {
466 dKSqSlices_[i].dissociate();
502 int nParams = unitCell().nParameter();
503 IntVec<D> const & meshDimensions = mesh().dimensions();
509 kMeshDimensions_ = meshDimensions;
510 kSize_ = mesh().size();
514 minImages_.allocate(kSize_ * D);
515 kSq_.allocate(kMeshDimensions_);
516 dKSq_.allocate(kSize_ * nParams);
517 dKSqSlices_.allocate(nParams);
518 for (
int i = 0; i < nParams; i++) {
519 dKSqSlices_[i].associate(dKSq_, i*kSize_, kMeshDimensions_);
525 implicitInverse_.allocate(kSize_);
533 inverseId = mesh().dimension(D-1) - kItr.
position(D-1);
535 if (inverseId >= kMeshDimensions_[D-1]) {
536 implicitInverse_h[kItr.
rank()] =
true;
538 implicitInverse_h[kItr.
rank()] =
false;
541 implicitInverse_ = implicitInverse_h;
556 if (unitCell().nParameter() > 1) {
558 sortedBunches_.clear();
562 hasMinImages_ =
false;
563 hasMinImages_h_ =
false;
574 if (hasMinImages_)
return;
580 UTIL_CHECK(minImages_.capacity() == kSize_ * D);
585 for (
int i = 0; i < D; i++) {
590 minImages_ = imagesTmp;
598 for (
int j = 0; j < D; ++j) {
599 for (
int k = 0; k < D; ++k) {
600 kBasis_h[idx] = unitCell().kBasis(j)[k];
603 meshDims_h[j] = mesh().dimension(j);
606 meshDims = meshDims_h;
619 int nBlocks, nThreads;
622 if ((D == 3) && (nThreads < 128)) {
629 <<
"nThreads too small for computeMinimumImages.\n"
630 <<
"Setting nThreads equal to 128." << std::endl;
637 size_t sz = (D *
sizeof(int) +
sizeof(
cudaReal)) * nThreads;
638 _computeMinimumImages<D><<<nBlocks, nThreads, sz>>>
639 (minImages_.cArray(), kSq_.cArray(), kBasis.
cArray(),
640 meshDims.cArray(), kSize_);
642 hasMinImages_ =
true;
656 if (!hasMinImages_) {
673 for (
int j = 0; j < D; ++j) {
674 for (
int k = 0; k < D; ++k) {
675 kBasis_h[idx] = unitCell().kBasis(j)[k];
682 int nBlocks, nThreads;
688 _computeKSq<D><<<nBlocks, nThreads>>>
689 (kSq_.cArray(), minImages_.cArray(), kBasis.
cArray(),
690 unitCell().nParameter(), kSize_);
701 if (hasdKSq_)
return;
704 if (!hasMinImages_) {
718 for (
int i = 0 ; i < unitCell().nParameter(); ++i) {
719 for (
int j = 0; j < D; ++j) {
720 for (
int k = 0; k < D; ++k) {
721 idx = k + (j * D) + (i * D * D);
722 dkkBasis_h[idx] = unitCell().dkkBasis(i, j, k);
726 dkkBasis = dkkBasis_h;
729 int nBlocks, nThreads;
739 bool const * implicitInversePtr =
nullptr;
742 implicitInversePtr = implicitInverse_.cArray();
747 _computedKSq<D><<<nBlocks, nThreads, sz>>>
748 (dKSq_.cArray(), minImages_.cArray(), dkkBasis.
cArray(),
749 implicitInversePtr, unitCell().nParameter(), kSize_,
762 if (isSorted_)
return;
774 std::vector< Sort::Item<double> > items;
776 for (
int i = 0; i < kSize_; ++i) {
777 item.value = kSq_h[i];
779 items.push_back(item);
787 if (!sortedIds_.isAllocated()) {
788 sortedIds_.allocate(kSize_);
791 for (
int i = 0; i < kSize_; ++i) {
792 sortedIds_[i] = items[i].id;
796 double epsilon = 1.0E-8;
797 sortedBunches_.clear();
799 nBunch_ = sortedBunches_.size();
803 if (!bunchIds_.isAllocated()) {
804 bunchIds_.allocate(kSize_);
807 int begin, end, ib, iw;
808 for (ib = 0; ib < nBunch_; ++ib) {
809 begin = sortedBunches_[ib][0];
810 end = sortedBunches_[ib][1];
812 for (iw = begin; iw < end; ++iw) {
813 bunchIds_[sortedIds_[iw]] = ib;
830 if (!hasMinImages_h_) {
832 minImages_temp = minImages_;
834 for (j = 0; j < D; ++j) {
836 for (i = 0; i < kSize_; ++i) {
837 minImages_h_[i][j] = minImages_temp[i + k];
840 hasMinImages_h_ =
true;
Dynamic array on the GPU device with aligned data.
int capacity() const
Return array capacity.
Data * cArray()
Return pointer to underlying C array.
Template for dynamic array stored in host CPU memory.
An IntVec<D, T> is a D-component vector of elements of integer type T.
Iterator over points in a Mesh<D>.
int rank() const
Get the rank of current element.
void begin()
Set iterator to the first point in the mesh.
bool atEnd() const
Is this the end (i.e., one past the last point)?
IntVec< D > position() const
Get current position in the grid, as integer vector.
Description of a regular grid of points in a periodic domain.
int size() const
Get total number of grid points.
static void computeKMesh(IntVec< D > const &rMeshDimensions, IntVec< D > &kMeshDimensions, int &kSize)
Compute dimensions and size of k-space mesh for DFT of real data.
void sortWaves()
Sort waves in order of ascending wavevector norm.
void clearUnitCellData()
Clear all internal data that depends on lattice parameters.
void computedKSq()
Compute derivatives of |k|^2 w/ respect to unit cell parameters.
WaveList(bool isRealField=true)
Constructor.
void computeMinimumImages()
Compute minimum images of wavevectors, and also calculate kSq.
void allocate(Mesh< D > const &m, UnitCell< D > const &c)
Allocate memory and set association with a Mesh and UnitCell object.
void computeKSq()
Compute square norm |k|^2 for all wavevectors.
bool isRealField() const
Is this WaveList set up for use with real-valued fields?
HostDArray< IntVec< D > > const & minImages_h() const
Get minimum images as IntVec<D> objects on the host.
int nParameter() const
Get the number of parameters in the unit cell.
Base template for UnitCell<D> classes, D=1, 2 or 3.
static std::ostream & file()
Get log ostream by reference.
#define UTIL_CHECK(condition)
Assertion macro suitable for serial or parallel production code.
void setThreadsLogical(int nThreadsLogical)
Given total number of threads, set 1D execution configuration.
int nThreads()
Get the number of threads per block for execution.
void setThreadsPerBlock()
Set the number of threads per block to a default value.
dim3 meshDims()
Return last requested multidimensional grid of threads.
void findBunches(std::vector< Item< T > > const &items, GArray< Bunch > &bunches, T epsilon)
Identify "bunches" of equal values within a sorted vector.
void sort(std::vector< Item< T > > &items)
Sort a std::vector< Item<T> > by ascending item value.
Fields, FFTs, and utilities for periodic boundary conditions (CUDA).
Periodic fields and crystallography.
bool hasVariableAngle(typename UnitCell< D >::LatticeSystem lattice)
Return true if lattice type has variable angle parameters.
PSCF package top-level namespace.
cufftDoubleReal cudaReal
Real number type used in CPU code that uses FFTW.
Struct with value and index, to keep track of permutation.