1#ifndef RPG_WAVE_LIST_TPP
2#define RPG_WAVE_LIST_TPP
12#include <prdc/cuda/resources.h>
13#include <pscf/mesh/MeshIterator.h>
28 __global__
void _computeMinimumImages(
int* minImages, cudaReal* kSq,
29 cudaReal
const * kBasis,
42 __global__
void _computeMinimumImages<1>(
int* minImages, cudaReal* kSq,
43 cudaReal
const * kBasis,
47 unsigned int nThreads = blockDim.x * gridDim.x;
48 unsigned int startID = blockIdx.x * blockDim.x + threadIdx.x;
51 cudaReal kBasis_ = kBasis[0];
54 for (
int i = startID; i < kSize; i +=
nThreads) {
55 int img = minImages[i];
57 while (img > (meshDims_>>1)) {
60 while (img < -1*(meshDims_>>1)) {
65 kSq[i] = img * img * kBasis_ * kBasis_;
83 __global__
void _computeMinimumImages<2>(
int* minImages, cudaReal* kSq,
84 cudaReal
const * kBasis,
88 unsigned int startID = blockIdx.x * blockDim.x + threadIdx.x;
91 unsigned int paramID = (startID >> 5);
92 unsigned int imageID = startID - (paramID * 32);
98 s1 = imageID - (s0 * 5);
103 #ifdef SINGLE_PRECISION
104 cudaReal epsilon = 1.0E-4;
106 cudaReal epsilon = 1.0E-8;
111 extern __shared__ cudaReal kSqVals_[];
112 int* images_ = (
int*)&kSqVals_[blockDim.x];
114 if (paramID < kSize) {
117 images_[2*threadIdx.x] = minImages[paramID] + (s0 *
meshDims[0]);
118 images_[2*threadIdx.x+1] = minImages[kSize + paramID] +
122 cudaReal kVec0 = (kBasis[0] * images_[2*threadIdx.x]) +
123 (kBasis[2] * images_[2*threadIdx.x+1]);
124 cudaReal kVec1 = (kBasis[1] * images_[2*threadIdx.x]) +
125 (kBasis[3] * images_[2*threadIdx.x+1]);
127 kSqVals_[threadIdx.x] = (kVec0*kVec0) + (kVec1*kVec1);
133 for (
int stride = 16; stride > 0; stride >>= 1) {
134 if (paramID < kSize) {
135 if (imageID < stride) {
137 if (kSqVals_[threadIdx.x+stride] <
138 (kSqVals_[threadIdx.x] - epsilon)) {
140 }
else if (kSqVals_[threadIdx.x+stride] <
141 (kSqVals_[threadIdx.x] + epsilon)) {
144 for (
int i = 0; i < 2; ++i) {
145 if (images_[2*threadIdx.x+i] >
146 images_[2*(threadIdx.x+stride)+i]) {
149 if (images_[2*threadIdx.x+i] <
150 images_[2*(threadIdx.x+stride)+i]) {
157 images_[2*threadIdx.x] =
158 images_[2*(threadIdx.x+stride)];
159 images_[2*threadIdx.x+1] =
160 images_[2*(threadIdx.x+stride)+1];
161 kSqVals_[threadIdx.x] = kSqVals_[threadIdx.x+stride];
172 if ((imageID == 0) && (paramID < kSize)) {
173 kSq[paramID] = kSqVals_[threadIdx.x];
174 minImages[paramID] = images_[2*threadIdx.x];
175 minImages[paramID + kSize] = images_[2*threadIdx.x+1];
193 __global__
void _computeMinimumImages<3>(
int* minImages, cudaReal* kSq,
194 cudaReal
const * kBasis,
195 int const * meshDims,
198 unsigned int startID = blockIdx.x * blockDim.x + threadIdx.x;
201 unsigned int paramID = (startID >> 7);
202 unsigned int imageID = startID - (paramID * 128);
208 s1 = (imageID - (s0 * 25)) / 5;
209 s2 = imageID - (s0 * 25) - (s1 * 5);
215 #ifdef SINGLE_PRECISION
216 cudaReal epsilon = 1.0E-4;
218 cudaReal epsilon = 1.0E-8;
223 extern __shared__ cudaReal kSqVals_[];
224 int* images_ = (
int*)&kSqVals_[blockDim.x];
226 if (paramID < kSize) {
229 images_[3*threadIdx.x] = minImages[paramID] + (s0 *
meshDims[0]);
230 images_[3*threadIdx.x+1] = minImages[kSize + paramID] +
232 images_[3*threadIdx.x+2] = minImages[kSize + kSize + paramID] +
236 cudaReal kVec0(0.0), kVec1(0.0), kVec2(0.0);
237 for (
int k = 0; k < 3; ++k) {
238 kVec0 += kBasis[3*k] * images_[3*threadIdx.x + k];
239 kVec1 += kBasis[3*k+1] * images_[3*threadIdx.x + k];
240 kVec2 += kBasis[3*k+2] * images_[3*threadIdx.x + k];
243 kSqVals_[threadIdx.x] = (kVec0*kVec0) + (kVec1*kVec1) +
250 for (
int stride = 64; stride > 0; stride >>= 1) {
251 if (paramID < kSize) {
252 if (imageID < stride) {
254 if (kSqVals_[threadIdx.x+stride] <
255 (kSqVals_[threadIdx.x] - epsilon)) {
257 }
else if (kSqVals_[threadIdx.x+stride] <
258 (kSqVals_[threadIdx.x] + epsilon)) {
261 for (
int i = 0; i < 3; ++i) {
262 if (images_[3*threadIdx.x+i] >
263 images_[3*(threadIdx.x+stride)+i]) {
266 if (images_[3*threadIdx.x+i] <
267 images_[3*(threadIdx.x+stride)+i]) {
274 images_[3*threadIdx.x] =
275 images_[3*(threadIdx.x+stride)];
276 images_[3*threadIdx.x+1] =
277 images_[3*(threadIdx.x+stride)+1];
278 images_[3*threadIdx.x+2] =
279 images_[3*(threadIdx.x+stride)+2];
280 kSqVals_[threadIdx.x] = kSqVals_[threadIdx.x+stride];
290 if ((imageID == 0) && (paramID < kSize)) {
291 kSq[paramID] = kSqVals_[threadIdx.x];
292 minImages[paramID] = images_[3*threadIdx.x];
293 minImages[paramID + kSize] = images_[3*threadIdx.x+1];
294 minImages[paramID + kSize + kSize] = images_[3*threadIdx.x+2];
302 __global__
void _computeKSq(cudaReal* kSq,
int const * waveBz,
303 cudaReal
const * kBasis,
304 int const nParams,
int const kSize)
306 int nThreads = blockDim.x * gridDim.x;
307 int startID = blockIdx.x * blockDim.x + threadIdx.x;
310 __shared__ cudaReal kBasis_[D*D];
311 if (threadIdx.x < D * D) {
312 kBasis_[threadIdx.x] = kBasis[threadIdx.x];
317 int i, j, k, waveBz_;
318 cudaReal kVec[D], kSqVal;
325 for (i = startID; i < kSize; i +=
nThreads) {
328 for (j = 0; j < D; ++j) {
333 for (j = 0; j < D; ++j) {
334 waveBz_ = waveBz[i + (j * kSize)];
335 for (k = 0; k < D; ++k) {
336 kVec[k] += kBasis_[k + (D*j)] * waveBz_;
342 for (j = 0; j < D; ++j) {
343 kSqVal += kVec[j] * kVec[j];
356 __global__
void _computedKSq(cudaReal* dKSq,
int const * waveBz,
357 cudaReal
const * dkkBasis,
358 bool const * implicitInverse,
359 int const nParams,
int const kSize)
363 int nThreads = blockDim.x * gridDim.x;
364 int startID = blockIdx.x * blockDim.x + threadIdx.x;
368 extern __shared__ cudaReal dkkBasis_[];
369 if (threadIdx.x < nParams * D * D) {
370 dkkBasis_[threadIdx.x] = dkkBasis[threadIdx.x];
375 int param, i, j, k, waveBz_[D];
379 for (param = 0; param < nParams; ++param) {
380 for (i = startID; i < kSize; i +=
nThreads) {
386 for (j = 0; j < D; ++j) {
387 waveBz_[j] = waveBz[i + (j * kSize)];
391 for (j = 0; j < D; ++j) {
392 for (k = 0; k < D; ++k) {
393 dKSqVal += waveBz_[j] * waveBz_[k]
394 * dkkBasis_[k + (j * D) + (param * D * D)];
398 if (implicitInverse[i]) {
402 dKSq[(param * kSize) + i] = dKSqVal;
413 hasMinimumImages_(false),
416 unitCellPtr_(nullptr),
435 int nParams = unitCell().nParameter();
439 for(
int i = 0; i < D; ++i) {
441 kMeshDimensions_[i] = mesh().dimension(i);
443 kMeshDimensions_[i] = mesh().dimension(i) / 2 + 1;
445 kSize_ *= kMeshDimensions_[i];
448 minImages_.allocate(kSize_ * D);
449 kSq_.allocate(kMeshDimensions_);
450 dKSq_.allocate(kSize_ * nParams);
451 dKSqSlices_.allocate(nParams);
452 for (
int i = 0; i < nParams; i++) {
453 dKSqSlices_[i].associate(dKSq_, i*kSize_, kMeshDimensions_);
457 implicitInverse_.allocate(kSize_);
465 inverseId = mesh().dimension(D-1) - kItr.
position(D-1);
467 if (inverseId > kMeshDimensions_[D-1]) {
468 implicitInverse_h[kItr.
rank()] =
true;
470 implicitInverse_h[kItr.
rank()] =
false;
473 implicitInverse_ = implicitInverse_h;
481 if (hasVariableAngle()) {
482 hasMinimumImages_ =
false;
491 if (hasMinimumImages_)
return;
497 UTIL_CHECK(minImages_.capacity() == kSize_ * D);
502 for (
int i = 0; i < D; i++) {
507 minImages_ = imagesTmp;
515 for(
int j = 0; j < D; ++j) {
516 for(
int k = 0; k < D; ++k) {
517 kBasis_h[idx] = unitCell().kBasis(j)[k];
520 meshDims_h[j] = mesh().dimension(j);
523 meshDims = meshDims_h;
536 int nBlocks, nThreads;
539 if ((D == 3) && (nThreads < 128)) {
546 <<
"nThreads too small for computeMinimumImages.\n"
547 <<
"Setting nThreads equal to 128." << std::endl;
551 size_t sz = (D *
sizeof(int) +
sizeof(cudaReal)) * nThreads;
552 _computeMinimumImages<D><<<nBlocks, nThreads, sz>>>
553 (minImages_.cArray(), kSq_.cArray(), kBasis.
cArray(),
554 meshDims.cArray(), kSize_);
556 hasMinimumImages_ =
true;
565 if (!hasMinimumImages_) {
566 computeMinimumImages();
581 for(
int j = 0; j < D; ++j) {
582 for(
int k = 0; k < D; ++k) {
583 kBasis_h[idx] = unitCell().kBasis(j)[k];
590 int nBlocks, nThreads;
594 _computeKSq<D><<<nBlocks, nThreads>>>
595 (kSq_.cArray(), minImages_.cArray(), kBasis.
cArray(),
596 unitCell().nParameter(), kSize_);
604 if (hasdKSq_)
return;
607 if (!hasMinimumImages_) {
608 computeMinimumImages();
620 for(
int i = 0 ; i < unitCell().nParameter(); ++i) {
621 for(
int j = 0; j < D; ++j) {
622 for(
int k = 0; k < D; ++k) {
623 idx = k + (j * D) + (i * D * D);
624 dkkBasis_h[idx] = unitCell().dkkBasis(i, j, k);
628 dkkBasis = dkkBasis_h;
631 int nBlocks, nThreads;
639 size_t sz =
sizeof(cudaReal)*dkkBasis.
capacity();
640 _computedKSq<D><<<nBlocks, nThreads, sz>>>
641 (dKSq_.cArray(), minImages_.cArray(), dkkBasis.
cArray(),
642 implicitInverse_.cArray(), unitCell().nParameter(), kSize_);
Dynamic array on the GPU device with aligned data.
int capacity() const
Return allocated capacity.
Data * cArray()
Return pointer to underlying C array.
Template for dynamic array stored in host CPU memory.
Iterator over points in a Mesh<D>.
int rank() const
Get the rank of current element.
void begin()
Set iterator to the first point in the mesh.
bool atEnd() const
Is this the end (i.e., one past the last point)?
IntVec< D > position() const
Get current position in the grid, as integer vector.
Description of a regular grid of points in a periodic domain.
int size() const
Get total number of grid points.
int nParameter() const
Get the number of parameters in the unit cell.
Base template for UnitCell<D> classes, D=1, 2 or 3.
void computedKSq()
Compute derivatives of |k|^2 with respect to unit cell parameters.
void computeMinimumImages()
Compute minimum images of wavevectors.
void clearUnitCellData()
Clear all internal data that depends on lattice parameters.
void allocate(Mesh< D > const &m, UnitCell< D > const &c)
Allocate memory and set association with a Mesh and UnitCell object.
void computeKSq()
Compute the squared magnitude |k|^2 for each wavevector.
static std::ostream & file()
Get log ostream by reference.
#define UTIL_CHECK(condition)
Assertion macro suitable for serial or parallel production code.
void setThreadsLogical(int nThreadsLogical)
Given total number of threads, set 1D execution configuration.
int nThreads()
Get the number of threads per block for execution.
void setThreadsPerBlock()
Set the number of threads per block to a default value.
dim3 meshDims()
Return last requested multidimensional grid of threads.
PSCF package top-level namespace.