1#ifndef PRDC_CUDA_WAVE_LIST_TPP
2#define PRDC_CUDA_WAVE_LIST_TPP
13#include <prdc/cuda/resources.h>
14#include <prdc/cuda/FFT.h>
15#include <prdc/crystal/hasVariableAngle.h>
16#include <pscf/cuda/HostDArray.h>
17#include <pscf/mesh/MeshIterator.h>
18#include <pscf/math/IntVec.h>
32 __global__
void _computeMinimumImages(
int* minImages, cudaReal* kSq,
33 cudaReal
const * kBasis,
46 __global__
void _computeMinimumImages<1>(
int* minImages, cudaReal* kSq,
47 cudaReal
const * kBasis,
51 unsigned int nThreads = blockDim.x * gridDim.x;
52 unsigned int startID = blockIdx.x * blockDim.x + threadIdx.x;
55 cudaReal kBasis_ = kBasis[0];
58 for (
int i = startID; i < kSize; i +=
nThreads) {
59 int img = minImages[i];
61 while (img > (meshDims_>>1)) {
64 while (img < -1*(meshDims_>>1)) {
69 kSq[i] = img * img * kBasis_ * kBasis_;
87 __global__
void _computeMinimumImages<2>(
int* minImages, cudaReal* kSq,
88 cudaReal
const * kBasis,
92 unsigned int startID = blockIdx.x * blockDim.x + threadIdx.x;
95 unsigned int paramID = (startID >> 5);
96 unsigned int imageID = startID - (paramID * 32);
102 s1 = imageID - (s0 * 5);
107 #ifdef SINGLE_PRECISION
108 cudaReal epsilon = 1.0E-4;
110 cudaReal epsilon = 1.0E-8;
115 extern __shared__ cudaReal kSqVals_[];
116 int* images_ = (
int*)&kSqVals_[blockDim.x];
118 if (paramID < kSize) {
121 images_[2*threadIdx.x] = minImages[paramID] + (s0 *
meshDims[0]);
122 images_[2*threadIdx.x+1] = minImages[kSize + paramID] +
126 cudaReal kVec0 = (kBasis[0] * images_[2*threadIdx.x]) +
127 (kBasis[2] * images_[2*threadIdx.x+1]);
128 cudaReal kVec1 = (kBasis[1] * images_[2*threadIdx.x]) +
129 (kBasis[3] * images_[2*threadIdx.x+1]);
131 kSqVals_[threadIdx.x] = (kVec0*kVec0) + (kVec1*kVec1);
137 for (
int stride = 16; stride > 0; stride >>= 1) {
138 if (paramID < kSize) {
139 if (imageID < stride) {
141 if (kSqVals_[threadIdx.x+stride] <
142 (kSqVals_[threadIdx.x] - epsilon)) {
144 }
else if (kSqVals_[threadIdx.x+stride] <
145 (kSqVals_[threadIdx.x] + epsilon)) {
148 for (
int i = 0; i < 2; ++i) {
149 if (images_[2*threadIdx.x+i] >
150 images_[2*(threadIdx.x+stride)+i]) {
153 if (images_[2*threadIdx.x+i] <
154 images_[2*(threadIdx.x+stride)+i]) {
161 images_[2*threadIdx.x] =
162 images_[2*(threadIdx.x+stride)];
163 images_[2*threadIdx.x+1] =
164 images_[2*(threadIdx.x+stride)+1];
165 kSqVals_[threadIdx.x] = kSqVals_[threadIdx.x+stride];
176 if ((imageID == 0) && (paramID < kSize)) {
177 kSq[paramID] = kSqVals_[threadIdx.x];
178 minImages[paramID] = images_[2*threadIdx.x];
179 minImages[paramID + kSize] = images_[2*threadIdx.x+1];
197 __global__
void _computeMinimumImages<3>(
int* minImages, cudaReal* kSq,
198 cudaReal
const * kBasis,
199 int const * meshDims,
202 unsigned int startID = blockIdx.x * blockDim.x + threadIdx.x;
205 unsigned int paramID = (startID >> 7);
206 unsigned int imageID = startID - (paramID * 128);
212 s1 = (imageID - (s0 * 25)) / 5;
213 s2 = imageID - (s0 * 25) - (s1 * 5);
219 #ifdef SINGLE_PRECISION
220 cudaReal epsilon = 1.0E-4;
222 cudaReal epsilon = 1.0E-8;
227 extern __shared__ cudaReal kSqVals_[];
228 int* images_ = (
int*)&kSqVals_[blockDim.x];
230 if (paramID < kSize) {
233 images_[3*threadIdx.x] = minImages[paramID] + (s0 *
meshDims[0]);
234 images_[3*threadIdx.x+1] = minImages[kSize + paramID] +
236 images_[3*threadIdx.x+2] = minImages[kSize + kSize + paramID] +
240 cudaReal kVec0(0.0), kVec1(0.0), kVec2(0.0);
241 for (
int k = 0; k < 3; ++k) {
242 kVec0 += kBasis[3*k] * images_[3*threadIdx.x + k];
243 kVec1 += kBasis[3*k+1] * images_[3*threadIdx.x + k];
244 kVec2 += kBasis[3*k+2] * images_[3*threadIdx.x + k];
247 kSqVals_[threadIdx.x] = (kVec0*kVec0) + (kVec1*kVec1) +
254 for (
int stride = 64; stride > 0; stride >>= 1) {
255 if (paramID < kSize) {
256 if (imageID < stride) {
258 if (kSqVals_[threadIdx.x+stride] <
259 (kSqVals_[threadIdx.x] - epsilon)) {
261 }
else if (kSqVals_[threadIdx.x+stride] <
262 (kSqVals_[threadIdx.x] + epsilon)) {
265 for (
int i = 0; i < 3; ++i) {
266 if (images_[3*threadIdx.x+i] >
267 images_[3*(threadIdx.x+stride)+i]) {
270 if (images_[3*threadIdx.x+i] <
271 images_[3*(threadIdx.x+stride)+i]) {
278 images_[3*threadIdx.x] =
279 images_[3*(threadIdx.x+stride)];
280 images_[3*threadIdx.x+1] =
281 images_[3*(threadIdx.x+stride)+1];
282 images_[3*threadIdx.x+2] =
283 images_[3*(threadIdx.x+stride)+2];
284 kSqVals_[threadIdx.x] = kSqVals_[threadIdx.x+stride];
294 if ((imageID == 0) && (paramID < kSize)) {
295 kSq[paramID] = kSqVals_[threadIdx.x];
296 minImages[paramID] = images_[3*threadIdx.x];
297 minImages[paramID + kSize] = images_[3*threadIdx.x+1];
298 minImages[paramID + kSize + kSize] = images_[3*threadIdx.x+2];
306 __global__
void _computeKSq(cudaReal* kSq,
int const * waveBz,
307 cudaReal
const * kBasis,
308 int const nParams,
int const kSize)
310 int nThreads = blockDim.x * gridDim.x;
311 int startID = blockIdx.x * blockDim.x + threadIdx.x;
314 __shared__ cudaReal kBasis_[D*D];
315 if (threadIdx.x < D * D) {
316 kBasis_[threadIdx.x] = kBasis[threadIdx.x];
321 int i, j, k, waveBz_;
322 cudaReal kVec[D], kSqVal;
329 for (i = startID; i < kSize; i +=
nThreads) {
332 for (j = 0; j < D; ++j) {
337 for (j = 0; j < D; ++j) {
338 waveBz_ = waveBz[i + (j * kSize)];
339 for (k = 0; k < D; ++k) {
340 kVec[k] += kBasis_[k + (D*j)] * waveBz_;
346 for (j = 0; j < D; ++j) {
347 kSqVal += kVec[j] * kVec[j];
360 __global__
void _computedKSq(cudaReal* dKSq,
int const * waveBz,
361 cudaReal
const * dkkBasis,
362 bool const * implicitInverse,
363 int const nParams,
int const kSize,
368 int nThreads = blockDim.x * gridDim.x;
369 int startID = blockIdx.x * blockDim.x + threadIdx.x;
373 extern __shared__ cudaReal dkkBasis_[];
374 if (threadIdx.x < nParams * D * D) {
375 dkkBasis_[threadIdx.x] = dkkBasis[threadIdx.x];
380 int param, i, j, k, waveBz_[D];
384 for (param = 0; param < nParams; ++param) {
385 for (i = startID; i < kSize; i +=
nThreads) {
391 for (j = 0; j < D; ++j) {
392 waveBz_[j] = waveBz[i + (j * kSize)];
396 for (j = 0; j < D; ++j) {
397 for (k = 0; k < D; ++k) {
398 dKSqVal += waveBz_[j] * waveBz_[k]
399 * dkkBasis_[k + (j * D) + (param * D * D)];
404 if (implicitInverse[i]) {
410 dKSq[(param * kSize) + i] = dKSqVal;
425 hasMinImages_(false),
426 hasMinImages_h_(false),
429 unitCellPtr_(nullptr),
431 { isRealField_ = isRealField; }
437 WaveList<D>::~WaveList()
444 void WaveList<D>::allocate(Mesh<D>
const & m, UnitCell<D>
const & c)
454 int nParams = unitCell().nParameter();
455 IntVec<D>
const & meshDimensions = mesh().dimensions();
459 FFT<D>::computeKMesh(meshDimensions, kMeshDimensions_, kSize_);
461 kMeshDimensions_ = meshDimensions;
462 kSize_ = mesh().size();
465 minImages_.allocate(kSize_ * D);
466 kSq_.allocate(kMeshDimensions_);
467 dKSq_.allocate(kSize_ * nParams);
468 dKSqSlices_.allocate(nParams);
469 for (
int i = 0; i < nParams; i++) {
470 dKSqSlices_[i].associate(dKSq_, i*kSize_, kMeshDimensions_);
476 implicitInverse_.allocate(kSize_);
477 MeshIterator<D> kItr(kMeshDimensions_);
478 HostDArray<bool> implicitInverse_h(kSize_);
480 for (kItr.begin(); !kItr.atEnd(); ++kItr) {
481 if (kItr.position(D-1) == 0) {
484 inverseId = mesh().dimension(D-1) - kItr.position(D-1);
486 if (inverseId > kMeshDimensions_[D-1]) {
487 implicitInverse_h[kItr.rank()] =
true;
489 implicitInverse_h[kItr.rank()] =
false;
492 implicitInverse_ = implicitInverse_h;
503 void WaveList<D>::clearUnitCellData()
507 if (hasVariableAngle<D>(unitCell().lattice())) {
508 hasMinImages_ =
false;
509 hasMinImages_h_ =
false;
517 void WaveList<D>::computeMinimumImages()
520 if (hasMinImages_)
return;
524 UTIL_CHECK(unitCell().lattice() != UnitCell<D>::Null);
526 UTIL_CHECK(minImages_.capacity() == kSize_ * D);
529 HostDArray<int> imagesTmp(D*kSize_);
530 MeshIterator<D> kItr(kMeshDimensions_);
531 for (
int i = 0; i < D; i++) {
532 for (kItr.begin(); !kItr.atEnd(); ++kItr) {
533 imagesTmp[kItr.rank() + (i*kSize_)] = kItr.position(i);
536 minImages_ = imagesTmp;
539 HostDArray<cudaReal> kBasis_h(D*D);
540 HostDArray<int> meshDims_h(D);
541 DeviceArray<cudaReal> kBasis(D*D);
544 for(
int j = 0; j < D; ++j) {
545 for(
int k = 0; k < D; ++k) {
546 kBasis_h[idx] = unitCell().kBasis(j)[k];
549 meshDims_h[j] = mesh().dimension(j);
566 ThreadArray::setThreadsLogical(kSize_*threadsPerGP, nBlocks, nThreads);
568 if ((D == 3) && (nThreads < 128)) {
570 ThreadArray::setThreadsPerBlock(128);
571 ThreadArray::setThreadsLogical(kSize_*threadsPerGP, nBlocks, nThreads);
575 <<
"nThreads too small for computeMinimumImages.\n"
576 <<
"Setting nThreads equal to 128." << std::endl;
580 size_t sz = (D *
sizeof(int) +
sizeof(cudaReal)) * nThreads;
582 (minImages_.cArray(), kSq_.cArray(), kBasis.cArray(),
585 hasMinImages_ =
true;
593 void WaveList<D>::computeKSq()
599 if (!hasMinImages_) {
600 computeMinimumImages();
608 UTIL_CHECK(unitCell().lattice() != UnitCell<D>::Null);
612 HostDArray<cudaReal> kBasis_h(D*D);
613 DeviceArray<cudaReal> kBasis(D*D);
615 for(
int j = 0; j < D; ++j) {
616 for(
int k = 0; k < D; ++k) {
617 kBasis_h[idx] = unitCell().kBasis(j)[k];
625 ThreadArray::setThreadsLogical(kSize_, nBlocks, nThreads);
629 (kSq_.cArray(), minImages_.cArray(), kBasis.cArray(),
630 unitCell().nParameter(), kSize_);
639 void WaveList<D>::computedKSq()
641 if (hasdKSq_)
return;
644 if (!hasMinImages_) {
645 computeMinimumImages();
650 UTIL_CHECK(unitCell().lattice() != UnitCell<D>::Null);
655 HostDArray<cudaReal> dkkBasis_h(unitCell().nParameter() * D * D);
656 DeviceArray<cudaReal> dkkBasis;
657 for(
int i = 0 ; i < unitCell().nParameter(); ++i) {
658 for(
int j = 0; j < D; ++j) {
659 for(
int k = 0; k < D; ++k) {
660 idx = k + (j * D) + (i * D * D);
661 dkkBasis_h[idx] = unitCell().dkkBasis(i, j, k);
665 dkkBasis = dkkBasis_h;
669 ThreadArray::setThreadsLogical(kSize_, nBlocks, nThreads);
676 size_t sz =
sizeof(cudaReal)*dkkBasis.capacity();
678 (dKSq_.cArray(), minImages_.cArray(), dkkBasis.cArray(),
679 implicitInverse_.cArray(), unitCell().nParameter(), kSize_,
694 if (!hasMinImages_h_) {
696 minImages_temp = minImages_;
698 for (j = 0; j < D; ++j) {
700 for (i = 0; i < kSize_; ++i) {
701 minImages_h_[i][j] = minImages_temp[i + k];
704 hasMinImages_h_ =
true;
Template for dynamic array stored in host CPU memory.
WaveList(bool isRealField=true)
Constructor.
HostDArray< IntVec< D > > const & minImages_h() const
Get minimum images as IntVec<D> objects on the host.
static std::ostream & file()
Get log ostream by reference.
#define UTIL_CHECK(condition)
Assertion macro suitable for serial or parallel production code.
int nThreads()
Get the number of threads per block for execution.
int nBlocks()
Get the current number of blocks for execution.
dim3 meshDims()
Return last requested multidimensional grid of threads.
Fields, FFTs, and utilities for periodic boundary conditions (CUDA)
Periodic fields and crystallography.
PSCF package top-level namespace.