#ifndef PSPG_KERNEL_WRAPPERS_CU
#define PSPG_KERNEL_WRAPPERS_CU

#include "KernelWrappers.h"

namespace Pscf {
namespace Pspg {
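// Each wrapper below follows the same two-stage reduction pattern:
// ThreadGrid::setThreadsLogical sets the total number of threads
// logically required for execution and outputs the block and thread
// counts; a reduction kernel (reductionSum, reductionMax, etc., assumed
// to be declared in KernelWrappers.h or a header it includes) collapses
// the device array to one partial result per block; the host then copies
// the per-block results back and finishes the reduction serially.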
__host__ cudaReal gpuSum(cudaReal const * d_in, int size)
{
   // Establish GPU resources for this parallel reduction. The size is
   // halved because the kernel performs its first reduction step during
   // the load from global memory.
   int nBlocks, nThreads;
   int halvedsize = ceil((float)size/2);
   ThreadGrid::setThreadsLogical(halvedsize, nBlocks, nThreads);

   // Temporary host and device arrays holding one partial sum per block
   cudaReal* temp_ = new cudaReal[nBlocks];
   cudaReal* d_temp_;
   gpuErrchk(cudaMalloc((void**) &d_temp_, nBlocks*sizeof(cudaReal)));

   // Perform the parallel reduction sum on the device
   reductionSum<<<nBlocks, nThreads, nThreads*sizeof(cudaReal)>>>(d_temp_, d_in, size);

   // Copy the per-block partial sums back to the host
   gpuErrchk(cudaMemcpy(temp_, d_temp_, nBlocks*sizeof(cudaReal), cudaMemcpyDeviceToHost));

   // Finish on the CPU with Kahan summation to limit round-off error
   cudaReal sum = 0, tempVal, tempSum;
   cudaReal err = 0;
   for (int i = 0; i < nBlocks; ++i) {
      tempVal = temp_[i] - err;
      tempSum = sum + tempVal;
      err = tempSum - sum - tempVal;
      sum = tempSum;
   }

   cudaFree(d_temp_);
   delete[] temp_;
   return sum;
}
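/*
* Usage sketch (hypothetical caller, not part of this file): with a
* device array d_w of n cudaReal values allocated and filled by the
* caller, the mean value is
*
*    cudaReal mean = gpuSum(d_w, n) / n;
*/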
__host__ cudaReal gpuInnerProduct(cudaReal const * d_a, cudaReal const * d_b, int size)
{
   // Establish GPU resources; the size is halved because the kernel
   // performs its first reduction step during the load from global memory.
   int nBlocks, nThreads;
   int halvedsize = ceil((float)size/2);
   ThreadGrid::setThreadsLogical(halvedsize, nBlocks, nThreads);

   // Temporary host and device arrays holding one partial result per block
   cudaReal* temp_ = new cudaReal[nBlocks];
   cudaReal* d_temp_;
   gpuErrchk(cudaMalloc((void**) &d_temp_, nBlocks*sizeof(cudaReal)));

   // Perform the parallel reduction of the elementwise products
   reductionInnerProduct<<<nBlocks, nThreads, nThreads*sizeof(cudaReal)>>>(d_temp_, d_a, d_b, size);

   // Copy the per-block partial results back to the host
   gpuErrchk(cudaMemcpy(temp_, d_temp_, nBlocks*sizeof(cudaReal), cudaMemcpyDeviceToHost));

   // Finish on the CPU with Kahan summation to limit round-off error
   cudaReal sum = 0, tempVal, tempSum;
   cudaReal err = 0;
   for (int i = 0; i < nBlocks; ++i) {
      tempVal = temp_[i] - err;
      tempSum = sum + tempVal;
      err = tempSum - sum - tempVal;
      sum = tempSum;
   }

   cudaFree(d_temp_);
   delete[] temp_;
   return sum;
}
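/*
* Usage sketch (hypothetical caller): passing the same pointer for both
* arguments reduces the inner product to a squared Euclidean norm,
*
*    cudaReal normSq = gpuInnerProduct(d_resid, d_resid, n);
*/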
__host__ cudaReal gpuMax(cudaReal const * d_in, int size)
{
   // Establish GPU resources; the size is halved because the kernel
   // performs its first reduction step during the load from global memory.
   int nBlocks, nThreads;
   int halvedsize = ceil((float)size/2);
   ThreadGrid::setThreadsLogical(halvedsize, nBlocks, nThreads);

   // Temporary host and device arrays holding one partial maximum per block
   cudaReal* temp_ = new cudaReal[nBlocks];
   cudaReal* d_temp_;
   gpuErrchk(cudaMalloc((void**) &d_temp_, nBlocks*sizeof(cudaReal)));

   // Perform the parallel reduction maximum on the device
   reductionMax<<<nBlocks, nThreads, nThreads*sizeof(cudaReal)>>>(d_temp_, d_in, size);

   // Copy the per-block partial maxima back to the host
   gpuErrchk(cudaMemcpy(temp_, d_temp_, nBlocks*sizeof(cudaReal), cudaMemcpyDeviceToHost));

   // Finish the comparison on the CPU. Initialize from temp_[0] rather
   // than 0 so that all-negative inputs are handled correctly.
   cudaReal max = temp_[0];
   for (int i = 1; i < nBlocks; i++) {
      if (temp_[i] > max) max = temp_[i];
   }

   cudaFree(d_temp_);
   delete[] temp_;
   return max;
}
__host__ cudaReal gpuMaxAbs(cudaReal const * d_in, int size)
{
   // Establish GPU resources; the size is halved because the kernel
   // performs its first reduction step during the load from global memory.
   int nBlocks, nThreads;
   int halvedsize = ceil((float)size/2);
   ThreadGrid::setThreadsLogical(halvedsize, nBlocks, nThreads);

   // Temporary host and device arrays holding one partial maximum per block
   cudaReal* temp_ = new cudaReal[nBlocks];
   cudaReal* d_temp_;
   gpuErrchk(cudaMalloc((void**) &d_temp_, nBlocks*sizeof(cudaReal)));

   // Perform the parallel reduction of absolute-value maxima on the device
   reductionMaxAbs<<<nBlocks, nThreads, nThreads*sizeof(cudaReal)>>>(d_temp_, d_in, size);

   // Copy the per-block partial maxima back to the host
   gpuErrchk(cudaMemcpy(temp_, d_temp_, nBlocks*sizeof(cudaReal), cudaMemcpyDeviceToHost));

   // Finish the comparison on the CPU; partial results are non-negative,
   // so starting from 0 is safe here.
   cudaReal max = 0;
   for (int i = 0; i < nBlocks; i++) {
      if (temp_[i] > max) max = temp_[i];
   }

   cudaFree(d_temp_);
   delete[] temp_;
   return max;
}
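/*
* Usage sketch (hypothetical caller): gpuMaxAbs provides an
* infinity-norm error measure, so a convergence test on a residual
* array d_resid of n values can be written as
*
*    bool converged = (gpuMaxAbs(d_resid, n) < tolerance);
*/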
__host__ cudaReal gpuMin(cudaReal const * d_in, int size)
{
   // Establish GPU resources; the size is halved because the kernel
   // performs its first reduction step during the load from global memory.
   int nBlocks, nThreads;
   int halvedsize = ceil((float)size/2);
   ThreadGrid::setThreadsLogical(halvedsize, nBlocks, nThreads);

   // Temporary host and device arrays holding one partial minimum per block
   cudaReal* temp_ = new cudaReal[nBlocks];
   cudaReal* d_temp_;
   gpuErrchk(cudaMalloc((void**) &d_temp_, nBlocks*sizeof(cudaReal)));

   // Perform the parallel reduction minimum on the device
   reductionMin<<<nBlocks, nThreads, nThreads*sizeof(cudaReal)>>>(d_temp_, d_in, size);

   // Copy the per-block partial minima back to the host
   gpuErrchk(cudaMemcpy(temp_, d_temp_, nBlocks*sizeof(cudaReal), cudaMemcpyDeviceToHost));

   // Finish the comparison on the CPU
   cudaReal min = temp_[0];
   for (int i = 1; i < nBlocks; i++) {
      if (temp_[i] < min) min = temp_[i];
   }

   cudaFree(d_temp_);
   delete[] temp_;
   return min;
}
__host__ cudaReal gpuMinAbs(cudaReal const * d_in, int size)
{
   // Establish GPU resources; the size is halved because the kernel
   // performs its first reduction step during the load from global memory.
   int nBlocks, nThreads;
   int halvedsize = ceil((float)size/2);
   ThreadGrid::setThreadsLogical(halvedsize, nBlocks, nThreads);

   // Temporary host and device arrays holding one partial minimum per block
   cudaReal* temp_ = new cudaReal[nBlocks];
   cudaReal* d_temp_;
   gpuErrchk(cudaMalloc((void**) &d_temp_, nBlocks*sizeof(cudaReal)));

   // Perform the parallel reduction of absolute-value minima on the device
   reductionMinAbs<<<nBlocks, nThreads, nThreads*sizeof(cudaReal)>>>(d_temp_, d_in, size);

   // Copy the per-block partial minima back to the host
   gpuErrchk(cudaMemcpy(temp_, d_temp_, nBlocks*sizeof(cudaReal), cudaMemcpyDeviceToHost));

   // Finish the comparison on the CPU
   cudaReal min = temp_[0];
   for (int i = 1; i < nBlocks; i++) {
      if (temp_[i] < min) min = temp_[i];
   }

   cudaFree(d_temp_);
   delete[] temp_;
   return min;
}

} // namespace Pspg
} // namespace Pscf

#endif