PSCF v1.1
pspg/solvers/Block.tpp
1#ifndef PSPG_BLOCK_TPP
2#define PSPG_BLOCK_TPP
3
4/*
5* PSCF - Polymer Self-Consistent Field Theory
6*
7* Copyright 2016 - 2022, The Regents of the University of Minnesota
8* Distributed under the terms of the GNU General Public License.
9*/
10
11#include "Block.h"
12#include <pspg/math/GpuResources.h>
13#include <pscf/mesh/Mesh.h>
14#include <pscf/mesh/MeshIterator.h>
15#include <pscf/crystal/shiftToMinimum.h>
16#include <util/containers/FMatrix.h> // member template
17#include <util/containers/DArray.h> // member template
18#include <util/containers/FArray.h> // member template
19#include <sys/time.h>
20
21using namespace Util;
22
23namespace Pscf {
24namespace Pspg {
25
26 // CUDA kernels (only used in this file)
27
28 static __global__
29 void mulDelKsq(cudaReal* result, const cudaComplex* q1,
30 const cudaComplex* q2, const cudaReal* delKsq,
31 int paramN, int kSize, int rSize)
32 {
33 int nThreads = blockDim.x * gridDim.x;
34 int startID = blockIdx.x * blockDim.x + threadIdx.x;
35 for (int i = startID; i < kSize; i += nThreads) {
36 #ifdef SINGLE_PRECISION
37 result[i] = cuCmulf( q1[i],
38 cuConjf(q2[i])).x * delKsq[paramN * rSize + i];
39 #else
40 result[i] = cuCmul( q1[i],
41 cuConj(q2[i])).x * delKsq[paramN * rSize + i];
42 #endif
43 }
44 }
45
46 static __global__
47 void pointwiseMulSameStart(const cudaReal* a, const cudaReal* expW,
48 const cudaReal* expW2,
49 cudaReal* q1, cudaReal* q2,
50 int size)
51 {
52 int nThreads = blockDim.x * gridDim.x;
53 int startID = blockIdx.x * blockDim.x + threadIdx.x;
54 cudaReal input;
55 for (int i = startID; i < size; i += nThreads) {
56 input = a[i];
57 q1[i] = expW[i] * input;
58 q2[i] = expW2[i] * input;
59 }
60 }
61
62 static __global__
63 void pointwiseMulTwinned(const cudaReal* qr1,
64 const cudaReal* qr2,
65 const cudaReal* expW,
66 cudaReal* q1, cudaReal* q2, int size)
67 {
68 int nThreads = blockDim.x * gridDim.x;
69 int startID = blockIdx.x * blockDim.x + threadIdx.x;
70 cudaReal scale;
71 for (int i = startID; i < size; i += nThreads) {
72 scale = expW[i];
73 q1[i] = qr1[i] * scale;
74 q2[i] = qr2[i] * scale;
75 }
76 }
77
78 static __global__
79 void scaleComplexTwinned(cudaComplex* qk1, cudaComplex* qk2,
80 const cudaReal* expksq1,
81 const cudaReal* expksq2,
82 int size)
83 {
84 int nThreads = blockDim.x * gridDim.x;
85 int startID = blockIdx.x * blockDim.x + threadIdx.x;
86 for (int i = startID; i < size; i += nThreads) {
87 qk1[i].x *= expksq1[i];
88 qk1[i].y *= expksq1[i];
89 qk2[i].x *= expksq2[i];
90 qk2[i].y *= expksq2[i];
91 }
92 }
93
94 static __global__
95 void scaleComplex(cudaComplex* a, cudaReal* scale, int size)
96 {
97 int nThreads = blockDim.x * gridDim.x;
98 int startID = blockIdx.x * blockDim.x + threadIdx.x;
99 for(int i = startID; i < size; i += nThreads) {
100 a[i].x *= scale[i];
101 a[i].y *= scale[i];
102 }
103 }
104
105 static __global__
106 void richardsonExpTwinned(cudaReal* qNew, const cudaReal* q1,
107 const cudaReal* qr, const cudaReal* expW2, int size)
108 {
109 int nThreads = blockDim.x * gridDim.x;
110 int startID = blockIdx.x * blockDim.x + threadIdx.x;
111 cudaReal q2;
112 for (int i = startID; i < size; i += nThreads) {
113 q2 = qr[i] * expW2[i];
114 qNew[i] = (4.0 * q2 - q1[i]) / 3.0;
115 }
116 }
117
118 static __global__
119 void multiplyScaleQQ(cudaReal* result,
120 const cudaReal* p1,
121 const cudaReal* p2,
122 double scale, int size)
123 {
124
125 int nThreads = blockDim.x * gridDim.x;
126 int startID = blockIdx.x * blockDim.x + threadIdx.x;
127
128 for(int i = startID; i < size; i += nThreads) {
129 result[i] += scale * p1[i] * p2[i];
130 }
131
132 }
133
134 // Block<D> member functions
135
136 /*
137 * Constructor.
138 */
139 template <int D>
141 : meshPtr_(0),
142 kMeshDimensions_(0),
143 ds_(0.0),
144 ns_(0),
145 temp_(0),
146 isAllocated_(false),
147 hasExpKsq_(false),
148 expKsq_host(0),
149 expKsq2_host(0)
150 {
151 propagator(0).setBlock(*this);
152 propagator(1).setBlock(*this);
153 }
154
155 /*
156 * Destructor.
157 */
158 template <int D>
160 {
161 if (temp_) {
162 delete[] temp_;
163 cudaFree(d_temp_);
164 }
165
166 if (expKsq_host) {
167 delete[] expKsq_host;
168 delete[] expKsq2_host;
169 }
170 }
171
172 template <int D>
173 void Block<D>::setDiscretization(double ds, Mesh<D> const & mesh)
174 {
175 UTIL_CHECK(mesh.size() > 1);
176 UTIL_CHECK(ds > 0.0);
177 UTIL_CHECK(!isAllocated_);
178
179 // GPU Resources
180 ThreadGrid::setThreadsLogical(mesh.size(),nBlocks_,nThreads_);
181
182 // Set association to mesh
183 meshPtr_ = &mesh;
184
185 // Set contour length discretization for this block
186 int tempNs;
187 tempNs = floor( length()/(2.0 *ds) + 0.5 );
188 if (tempNs == 0) {
189 tempNs = 1;
190 }
191 ns_ = 2*tempNs + 1;
192
193 ds_ = length()/double(ns_ - 1);
194
195 // Compute Fourier space kMeshDimensions_
196 for (int i = 0; i < D; ++i) {
197 if (i < D - 1) {
198 kMeshDimensions_[i] = mesh.dimensions()[i];
199 } else {
200 kMeshDimensions_[i] = mesh.dimensions()[i]/2 + 1;
201 }
202 }
203
204 kSize_ = 1;
205 for(int i = 0; i < D; ++i) {
206 kSize_ *= kMeshDimensions_[i];
207 }
208
209 // Allocate work arrays
210 expKsq_.allocate(kMeshDimensions_);
211 expKsq2_.allocate(kMeshDimensions_);
212 expW_.allocate(mesh.dimensions());
213 expW2_.allocate(mesh.dimensions());
214 qr_.allocate(mesh.dimensions());
215 qr2_.allocate(mesh.dimensions());
216 qk_.allocate(mesh.dimensions());
217 qk2_.allocate(mesh.dimensions());
218 q1_.allocate(mesh.dimensions());
219 q2_.allocate(mesh.dimensions());
220
221 propagator(0).allocate(ns_, mesh);
222 propagator(1).allocate(ns_, mesh);
223 gpuErrchk( cudaMalloc((void**)&qkBatched_,
224 ns_ * kSize_ * sizeof(cudaComplex))
225 );
226 gpuErrchk( cudaMalloc((void**)&qk2Batched_,
227 ns_ * kSize_ * sizeof(cudaComplex))
228 );
229 cField().allocate(mesh.dimensions());
230
231 gpuErrchk(cudaMalloc((void**)&d_temp_, nBlocks_ * sizeof(cudaReal)));
232 temp_ = new cudaReal[nBlocks_];
233
234 expKsq_host = new cudaReal[kSize_];
235 expKsq2_host = new cudaReal[kSize_];
236
237 isAllocated_ = true;
238 hasExpKsq_ = false;
239
240 }
241
242 /*
243 * Set or reset the the block length.
244 */
245 template <int D>
246 void Block<D>::setLength(double length)
247 {
249 if (isAllocated_) {
250 UTIL_CHECK(ns_ > 1);
251 ds_ = length/double(ns_ - 1);
252 }
253 hasExpKsq_ = false;
254 }
255
256 /*
257 * Set or reset the the block length.
258 */
259 template <int D>
260 void Block<D>::setKuhn(double kuhn)
261 {
262 BlockTmpl< Propagator<D> >::setKuhn(kuhn);
263 hasExpKsq_ = false;
264 }
265
266 /*
267 * Setup data that depend on the unit cell parameters.
268 */
269 template <int D>
271 const WaveList<D>& wavelist)
272 {
273 nParams_ = unitCell.nParameter();
274
275 // store pointer to unit cell and wavelist
276 unitCellPtr_ = &unitCell;
277 waveListPtr_ = &wavelist;
278
279 hasExpKsq_ = false;
280 }
281
282 template <int D>
284 {
285 UTIL_CHECK(isAllocated_);
286
287 MeshIterator<D> iter;
288 iter.setDimensions(kMeshDimensions_);
289 IntVec<D> G, Gmin;
290 double Gsq;
291 double factor = -1.0*kuhn()*kuhn()*ds_/6.0;
292
293 // Setup expKsq values on Host then transfer to device
294 int kSize = 1;
295 for(int i = 0; i < D; ++i) {
296 kSize *= kMeshDimensions_[i];
297 }
298
299 int i;
300 for (iter.begin(); !iter.atEnd(); ++iter) {
301 i = iter.rank();
302 Gsq = unitCell().ksq(wavelist().minImage(iter.rank()));
303 expKsq_host[i] = exp(Gsq*factor);
304 expKsq2_host[i] = exp(Gsq*factor / 2);
305 }
306
307 cudaMemcpy(expKsq_.cDField(), expKsq_host,
308 kSize * sizeof(cudaReal), cudaMemcpyHostToDevice);
309 cudaMemcpy(expKsq2_.cDField(), expKsq2_host,
310 kSize * sizeof(cudaReal), cudaMemcpyHostToDevice);
311
312 hasExpKsq_ = true;
313 }
314
315 /*
316 * Setup the contour length step algorithm.
317 */
318 template <int D>
319 void
321 {
322 // Preconditions
323 int nx = mesh().size();
324 UTIL_CHECK(nx > 0);
325 UTIL_CHECK(isAllocated_);
326
327 // Populate expW_
328 assignExp<<<nBlocks_, nThreads_>>>(expW_.cDField(), w.cDField(),
329 (double)0.5* ds_, nx);
330 assignExp<<<nBlocks_, nThreads_>>>(expW2_.cDField(), w.cDField(),
331 (double)0.25 * ds_, nx);
332
333 // Compute expKsq arrays if necessary
334 if (!hasExpKsq_) {
335 computeExpKsq();
336 }
337
338 }
339
340 /*
341 * Integrate to calculate monomer concentration for this block
342 */
343 template <int D>
344 void Block<D>::computeConcentration(double prefactor)
345 {
346 // Preconditions
347 int nx = mesh().size();
348 UTIL_CHECK(nx > 0);
349 UTIL_CHECK(ns_ > 0);
350 UTIL_CHECK(ds_ > 0);
351 UTIL_CHECK(propagator(0).isAllocated());
352 UTIL_CHECK(propagator(1).isAllocated());
353 UTIL_CHECK(cField().capacity() == nx)
354
355 // Initialize cField to zero at all points
356 assignUniformReal<<<nBlocks_, nThreads_>>>
357 (cField().cDField(), 0.0, nx);
358
359 Pscf::Pspg::Propagator<D> const & p0 = propagator(0);
360 Pscf::Pspg::Propagator<D> const & p1 = propagator(1);
361
362 multiplyScaleQQ<<<nBlocks_, nThreads_>>>
363 (cField().cDField(), p0.q(0), p1.q(ns_ - 1), 1.0, nx);
364 multiplyScaleQQ<<<nBlocks_, nThreads_>>>
365 (cField().cDField(), p0.q(ns_-1), p1.q(0), 1.0, nx);
366 for (int j = 1; j < ns_ - 1; j += 2) {
367 // Odd indices
368 multiplyScaleQQ<<<nBlocks_, nThreads_>>>
369 (cField().cDField(), p0.q(j), p1.q(ns_ - 1 - j), 4.0, nx);
370 }
371 for (int j = 2; j < ns_ - 2; j += 2) {
372 // Even indices
373 multiplyScaleQQ<<<nBlocks_, nThreads_>>>
374 (cField().cDField(), p0.q(j), p1.q(ns_ - 1 - j), 2.0, nx);
375 }
376
377 scaleReal<<<nBlocks_, nThreads_>>>
378 (cField().cDField(), (prefactor * ds_/3.0), nx);
379
380 }
381
382 template <int D>
384 if (!fft_.isSetup()) {
385 fft_.setup(qr_, qk_);
386 fftBatched_.setup(mesh().dimensions(), kMeshDimensions_, ns_);
387 }
388 }
389
390 /*
391 * Propagate solution by one step.
392 */
393 template <int D>
394 void Block<D>::step(const cudaReal* q, cudaReal* qNew)
395 {
396 // Preconditions
397 UTIL_CHECK(isAllocated_);
398 UTIL_CHECK(hasExpKsq_);
399
400 // Check real-space mesh sizes
401 int nx = mesh().size();
402 UTIL_CHECK(nx > 0);
403 UTIL_CHECK(qr_.capacity() == nx);
404 UTIL_CHECK(expW_.capacity() == nx);
405
406 // Fourier-space mesh sizes
407 int nk = qk_.capacity();
408 UTIL_CHECK(expKsq_.capacity() == nk);
409
410 // Apply pseudo-spectral algorithm
411
412 pointwiseMulSameStart<<<nBlocks_, nThreads_>>>
413 (q, expW_.cDField(), expW2_.cDField(),
414 qr_.cDField(), qr2_.cDField(), nx);
415 fft_.forwardTransform(qr_, qk_);
416 fft_.forwardTransform(qr2_, qk2_);
417 scaleComplexTwinned<<<nBlocks_, nThreads_>>>
418 (qk_.cDField(), qk2_.cDField(),
419 expKsq_.cDField(), expKsq2_.cDField(), nk);
420 fft_.inverseTransform(qk_, qr_);
421 fft_.inverseTransform(qk2_, q2_);
422 pointwiseMulTwinned<<<nBlocks_, nThreads_>>>
423 (qr_.cDField(), q2_.cDField(), expW_.cDField(),
424 q1_.cDField(), qr_.cDField(), nx);
425 fft_.forwardTransform(qr_, qk_);
426 scaleComplex<<<nBlocks_, nThreads_>>>(qk_.cDField(), expKsq2_.cDField(), nk);
427 fft_.inverseTransform(qk_, qr_);
428 richardsonExpTwinned<<<nBlocks_, nThreads_>>>(qNew, q1_.cDField(),
429 qr_.cDField(), expW2_.cDField(), nx);
430 //remove the use of q2
431
432
433 }
434
435
436 /*
437 * Compute stress contribution from this block.
438 */
439 template <int D>
440 void Block<D>::computeStress(WaveList<D> const & wavelist,
441 double prefactor)
442 {
443 int nx = mesh().size();
444
445 // Preconditions
446 UTIL_CHECK(nx > 0);
447 UTIL_CHECK(ns_ > 0);
448 UTIL_CHECK(ds_ > 0);
449 UTIL_CHECK(propagator(0).isAllocated());
450 UTIL_CHECK(propagator(1).isAllocated());
451
452 double dels, normal, increment;
453 normal = 3.0*6.0;
454
456 int i;
457 for (i = 0; i < 6; ++i) {
458 dQ [i] = 0.0;
459 stress_[i] = 0.0;
460 }
461
462 Pscf::Pspg::Propagator<D> const & p0 = propagator(0);
463 Pscf::Pspg::Propagator<D> const & p1 = propagator(1);
464
465 fftBatched_.forwardTransform(p0.head(), qkBatched_, ns_);
466 fftBatched_.forwardTransform(p1.head(), qk2Batched_, ns_);
467 cudaMemset(qr2_.cDField(), 0, mesh().size() * sizeof(cudaReal));
468
469 for (int j = 0; j < ns_ ; ++j) {
470
471 dels = ds_;
472 if (j != 0 && j != ns_ - 1) {
473 if (j % 2 == 0) {
474 dels = dels*2.0;
475 } else {
476 dels = dels*4.0;
477 }
478 }
479
480 for (int n = 0; n < nParams_ ; ++n) {
481 mulDelKsq<<<nBlocks_, nThreads_ >>>
482 (qr2_.cDField(),
483 qkBatched_ + (j*kSize_),
484 qk2Batched_ + (kSize_ * (ns_ -1 -j)),
485 wavelist.dkSq(), n , kSize_, nx);
486
487 increment = gpuSum(qr2_.cDField(), mesh().size());
488 increment = (increment * kuhn() * kuhn() * dels)/normal;
489 dQ [n] = dQ[n]-increment;
490 }
491
492 }
493
494 // Normalize
495 for (i = 0; i < nParams_; ++i) {
496 stress_[i] = stress_[i] - (dQ[i] * prefactor);
497 }
498 }
499
500}
501}
502#endif
virtual void setLength(double length)
Set the length of this block.
Class template for a block in a block copolymer.
Definition: BlockTmpl.h:89
Propagator< D > & propagator(int directionId)
Get a Propagator for a specified direction.
Definition: BlockTmpl.h:174
An IntVec<D, T> is a D-component vector of elements of integer type T.
Definition: IntVec.h:27
Iterator over points in a Mesh<D>.
Definition: MeshIterator.h:29
int rank() const
Get the rank of current element.
Definition: MeshIterator.h:136
void begin()
Set iterator to the first point in the mesh.
bool atEnd() const
Is this the end (i.e., one past the last point)?
Definition: MeshIterator.h:141
void setDimensions(const IntVec< D > &dimensions)
Set the grid dimensions in all directions.
Description of a regular grid of points in a periodic domain.
Definition: Mesh.h:61
IntVec< D > dimensions() const
Get an IntVec<D> of the grid dimensions.
Definition: Mesh.h:217
int size() const
Get total number of grid points.
Definition: Mesh.h:229
Block within a branched polymer.
void computeConcentration(double prefactor)
Compute unnormalized concentration for block by integration.
void setKuhn(double kuhn)
Set or reset monomer statistical segment length.
void computeStress(WaveList< D > const &waveList, double prefactor)
Compute derivatives of free energy w/ respect to cell parameters.
void setupSolver(RDField< D > const &w)
Set solver for this block.
void setDiscretization(double ds, const Mesh< D > &mesh)
Initialize discretization and allocate required memory.
void setupUnitCell(UnitCell< D > const &unitCell, WaveList< D > const &waveList)
Setup parameters that depend on the unit cell.
void setupFFT()
Initialize FFT and batch FFT classes.
void setLength(double length)
Set or reset block length.
void step(cudaReal const *q, cudaReal *qNew)
Compute step of integration loop, from i to i+1.
Data * cDField()
Return pointer to underlying C array.
Definition: DField.h:119
MDE solver for one-direction of one block.
cudaReal * head() const
Return q-field at beginning of block (initial condition).
const cudaReal * q(int i) const
Return q-field at specified step.
Field of real single precision values on an FFT mesh on a device.
Definition: RDField.h:34
Container for wavevector data.
Definition: WaveList.h:31
cudaReal * dkSq() const
Get a pointer to the dkSq array on device.
Definition: WaveList.h:159
int nParameter() const
Get the number of parameters in the unit cell.
Definition: UnitCellBase.h:243
Base template for UnitCell<D> classes, D=1, 2 or 3.
Definition: UnitCell.h:44
A fixed size (static) contiguous array template.
Definition: FArray.h:47
#define UTIL_CHECK(condition)
Assertion macro suitable for serial or parallel production code.
Definition: global.h:68
int nThreads()
Get the number of threads per block for execution.
Definition: ThreadGrid.cu:173
void setThreadsLogical(int nThreadsLogical)
Set the total number of threads required for execution.
Definition: ThreadGrid.cu:80
C++ namespace for polymer self-consistent field theory (PSCF).
Utility classes for scientific computation.
Definition: accumulators.mod:1