master sponge performance

pull/14539/head
mamba_ni 4 years ago
parent 18d79d35b6
commit 28811fa958

@ -0,0 +1,51 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/gpu/cuda_impl/sponge/common_sponge.cuh"
#include "backend/kernel_compiler/gpu/cuda_impl/sponge/common/crd_to_uint_crd_impl.cuh"
__global__ void Crd_To_Uint_Crd(const int atom_numbers, const VECTOR *scale_factor, const VECTOR *crd,
UNSIGNED_INT_VECTOR *uint_crd) {
int atom_i = blockDim.x * blockIdx.x + threadIdx.x;
if (atom_i < atom_numbers) {
uint_crd[atom_i].uint_x = crd[atom_i].x * scale_factor[0].x;
uint_crd[atom_i].uint_y = crd[atom_i].y * scale_factor[0].y;
uint_crd[atom_i].uint_z = crd[atom_i].z * scale_factor[0].z;
/*uint_crd[atom_i].uint_x = 2 * uint_crd[atom_i].uint_x;
uint_crd[atom_i].uint_y = 2 * uint_crd[atom_i].uint_y;
uint_crd[atom_i].uint_z = 2 * uint_crd[atom_i].uint_z;*/
uint_crd[atom_i].uint_x = uint_crd[atom_i].uint_x << 1;
uint_crd[atom_i].uint_y = uint_crd[atom_i].uint_y << 1;
uint_crd[atom_i].uint_z = uint_crd[atom_i].uint_z << 1;
}
}
void CrdToUintCrd(const int atom_numbers, const float *crd_to_uint_crd_cof_f, const float *crd_f,
unsigned int *uint_crd_f, cudaStream_t stream) {
VECTOR *crd = const_cast<VECTOR *>(reinterpret_cast<const VECTOR *>(crd_f));
VECTOR *crd_to_uint_crd_cof = const_cast<VECTOR *>(reinterpret_cast<const VECTOR *>(crd_to_uint_crd_cof_f));
UNSIGNED_INT_VECTOR *uint_crd =
const_cast<UNSIGNED_INT_VECTOR *>(reinterpret_cast<const UNSIGNED_INT_VECTOR *>(uint_crd_f));
Crd_To_Uint_Crd<<<ceilf(static_cast<float>(atom_numbers) / 128.0), 128, 0, stream>>>(
atom_numbers, crd_to_uint_crd_cof, crd, uint_crd);
return;
}
void CrdToUintCrd(const int atom_numbers, const float *crd_to_uint_crd_cof_f, const float *crd_f,
unsigned int *uint_crd_f, cudaStream_t stream);
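
A minimal host-side usage sketch for the wrapper above (not part of the patch; the atom count, buffer names, and allocation sizes are illustrative only):

// Illustrative driver: three scale factors and 3*n coordinate floats in, 3*n unsigned ints out.
const int n = 128;                      // hypothetical atom count
float *d_cof = nullptr, *d_crd = nullptr;
unsigned int *d_uint_crd = nullptr;
cudaMalloc(&d_cof, 3 * sizeof(float));
cudaMalloc(&d_crd, 3 * n * sizeof(float));
cudaMalloc(&d_uint_crd, 3 * n * sizeof(unsigned int));
cudaStream_t stream;
cudaStreamCreate(&stream);
// ... fill d_cof and d_crd on the device ...
CrdToUintCrd(n, d_cof, d_crd, d_uint_crd, stream);  // declared in crd_to_uint_crd_impl.cuh
cudaStreamSynchronize(stream);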

@ -0,0 +1,26 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_CRD_TO_UINT_CRD_IMPL_H_
#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_CRD_TO_UINT_CRD_IMPL_H_
#include <curand_kernel.h>
#include "runtime/device/gpu/cuda_common.h"
void CrdToUintCrd(const int atom_numbers, const float *crd_to_uint_crd_cof_f, const float *crd_f,
unsigned int *uint_crd_f, cudaStream_t stream);
#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_CRD_TO_UINT_CRD_IMPL_H_

@ -14,7 +14,7 @@
* limitations under the License.
*/
#include "backend/kernel_compiler/gpu/cuda_impl/sponge/neighbor_list/neighbor_list_impl.cuh"
#include <vector>
__global__ void Copy_List(const int element_numbers, const int *origin_list, int *list) {
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < element_numbers) {
@ -387,7 +387,7 @@ __global__ void Mul_half(float *src, float *dst) {
}
}
void Neighbor_List_Update(int grid_numbers, int atom_numbers, int *d_refresh_count, int refresh_interval,
int not_first_time, float skin, int Nxy, float cutoff_square, float cutoff_with_skin_square,
int *grid_N, float *box_length, int *atom_numbers_in_grid_bucket, float *grid_length_inverse,
int *atom_in_grid_serial, GRID_BUCKET *bucket, float *crd, float *old_crd,
@ -397,15 +397,22 @@ void Neighbor_List_Update(int grid_numbers, int atom_numbers, int refresh_count,
int *is_need_refresh_neighbor_list, cudaStream_t stream) {
if (not_first_time) {
if (refresh_interval > 0) {
std::vector<int> refresh_count_list(1);
cudaMemcpyAsync(refresh_count_list.data(), d_refresh_count, sizeof(int), cudaMemcpyDeviceToHost, stream);
cudaStreamSynchronize(stream);
int refresh_count = refresh_count_list[0];
if (refresh_count % refresh_interval == 0) {
Mul_half<<<1, 3, 0, stream>>>(crd_to_uint_crd_cof, half_crd_to_uint_crd_cof);
Refresh_Neighbor_List_No_Check(grid_numbers, atom_numbers, skin, Nxy, cutoff_square, grid_N, box_length,
atom_numbers_in_grid_bucket, grid_length_inverse, atom_in_grid_serial, bucket,
reinterpret_cast<VECTOR *>(crd), reinterpret_cast<VECTOR *>(old_crd),
half_crd_to_uint_crd_cof, reinterpret_cast<UNSIGNED_INT_VECTOR *>(uint_crd),
uint_dr_to_dr_cof, gpointer, d_nl, excluded_list_start, excluded_list,
excluded_numbers, stream);
}
refresh_count += 1;
cudaMemcpyAsync(d_refresh_count, &refresh_count, sizeof(int), cudaMemcpyHostToDevice, stream);
} else {
Is_need_refresh_neighbor_list_cuda<<<ceilf(static_cast<float>(atom_numbers) / 128), 128, 0, stream>>>(
atom_numbers, reinterpret_cast<VECTOR *>(crd), reinterpret_cast<VECTOR *>(old_crd), half_skin_square,
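
The hunk above keeps the refresh counter in device memory (d_refresh_count) and round-trips it through the host once per update. A stripped-down sketch of that pattern, with hypothetical names (d_counter, interval), assuming a valid CUDA stream and a device-resident int:

// Read the device counter, test it on the host, then write the incremented value back.
int h_counter = 0;
cudaMemcpyAsync(&h_counter, d_counter, sizeof(int), cudaMemcpyDeviceToHost, stream);
cudaStreamSynchronize(stream);  // the value must land on the host before the modulo test
if (h_counter % interval == 0) {
  // rebuild the neighbor list here
}
h_counter += 1;
cudaMemcpyAsync(d_counter, &h_counter, sizeof(int), cudaMemcpyHostToDevice, stream);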

@ -48,7 +48,7 @@ void Construct_Neighbor_List(int grid_numbers, int max_neighbor_numbers, int *nl
void CopyNeighborListAtomNumber(int atom_numbers, NEIGHBOR_LIST *nl, int *nl_atom_numbers, cudaStream_t stream);
void Neighbor_List_Update(int grid_numbers, int atom_numbers, int *d_refresh_count, int refresh_interval,
int not_first_time, float skin, int Nxy, float cutoff_square, float cutoff_with_skin_square,
int *grid_N, float *box_length, int *atom_numbers_in_grid_bucket, float *grid_length_inverse,
int *atom_in_grid_serial, GRID_BUCKET *bucket, float *crd, float *old_crd,

@ -0,0 +1,67 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/gpu/cuda_impl/sponge/nvtit/md_iteration_leap_frog_liujian_gpu_impl.cuh"
#include "backend/kernel_compiler/gpu/cuda_impl/util.cuh"
#include "backend/kernel_compiler/gpu/cuda_impl/sponge/common_sponge.cuh"
__global__ void MD_Iteration_Leap_Frog_With_LiuJian_kernel(const int atom_numbers, const float half_dt, const float dt,
const float exp_gamma, float *inverse_mass,
float *sqrt_mass_inverse, VECTOR *vel, VECTOR *crd,
VECTOR *frc, VECTOR *acc, VECTOR *random_frc,
VECTOR *output) {
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < atom_numbers) {
acc[i].x = inverse_mass[i] * frc[i].x;
acc[i].y = inverse_mass[i] * frc[i].y;
acc[i].z = inverse_mass[i] * frc[i].z;
vel[i].x = vel[i].x + dt * acc[i].x;
vel[i].y = vel[i].y + dt * acc[i].y;
vel[i].z = vel[i].z + dt * acc[i].z;
output[i].x = crd[i].x + half_dt * vel[i].x;
output[i].y = crd[i].y + half_dt * vel[i].y;
output[i].z = crd[i].z + half_dt * vel[i].z;
vel[i].x = exp_gamma * vel[i].x + sqrt_mass_inverse[i] * random_frc[i].x;
vel[i].y = exp_gamma * vel[i].y + sqrt_mass_inverse[i] * random_frc[i].y;
vel[i].z = exp_gamma * vel[i].z + sqrt_mass_inverse[i] * random_frc[i].z;
output[i].x = output[i].x + half_dt * vel[i].x;
output[i].y = output[i].y + half_dt * vel[i].y;
output[i].z = output[i].z + half_dt * vel[i].z;
}
}
void MD_Iteration_Leap_Frog_With_LiuJian(const int atom_numbers, const float half_dt, const float dt,
const float exp_gamma, int float4_numbers, float *inverse_mass,
float *sqrt_mass_inverse, float *vel, float *crd, float *frc, float *acc,
curandStatePhilox4_32_10_t *rand_state, float *rand_frc, float *output,
cudaStream_t stream) {
Rand_Normal<<<ceilf(static_cast<float>(float4_numbers) / 32.), 32, 0, stream>>>(float4_numbers, rand_state,
reinterpret_cast<float4 *>(rand_frc));
VECTOR *d_vel = reinterpret_cast<VECTOR *>(vel);
VECTOR *d_crd = reinterpret_cast<VECTOR *>(crd);
VECTOR *d_frc = reinterpret_cast<VECTOR *>(frc);
VECTOR *d_acc = reinterpret_cast<VECTOR *>(acc);
VECTOR *d_rand_frc = reinterpret_cast<VECTOR *>(rand_frc);
VECTOR *d_out = reinterpret_cast<VECTOR *>(output);
MD_Iteration_Leap_Frog_With_LiuJian_kernel<<<ceilf(static_cast<float>(atom_numbers) / 32), 32, 0, stream>>>(
atom_numbers, half_dt, dt, exp_gamma, inverse_mass, sqrt_mass_inverse, d_vel, d_crd, d_frc, d_acc, d_rand_frc,
d_out);
}
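
As a reading aid, the per-component update performed by MD_Iteration_Leap_Frog_With_LiuJian_kernel can be written as the following scalar reference (a sketch only, not part of the patch; it mirrors the kernel body above):

// One Liu-Jian Langevin leap-frog step for a single coordinate component.
float liujian_step(float crd, float vel_in, float frc, float inverse_mass, float sqrt_mass_inverse,
                   float random_frc, float half_dt, float dt, float exp_gamma, float *vel_out) {
  float acc = inverse_mass * frc;                          // a = F / m
  float vel = vel_in + dt * acc;                           // full-step velocity update
  float out = crd + half_dt * vel;                         // first half-step position update
  vel = exp_gamma * vel + sqrt_mass_inverse * random_frc;  // friction plus thermal kick
  out = out + half_dt * vel;                               // second half-step position update
  *vel_out = vel;
  return out;
}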

@ -0,0 +1,28 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_MD_ITERATION_LEAP_FROG_LIUJIAN_GPU_IMPL_H_
#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_MD_ITERATION_LEAP_FROG_LIUJIAN_GPU_IMPL_H_
#include <curand_kernel.h>
#include "runtime/device/gpu/cuda_common.h"
void MD_Iteration_Leap_Frog_With_LiuJian(const int atom_numbers, const float half_dt, const float dt,
const float exp_gamma, int float4_numbers, float *inverse_mass,
float *sqrt_mass_inverse, float *vel, float *crd, float *frc, float *acc,
curandStatePhilox4_32_10_t *rand_state, float *rand_frc, float *output,
cudaStream_t stream);
#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_MD_ITERATION_LEAP_FROG_LIUJIAN_GPU_IMPL_H_

@ -0,0 +1,28 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/gpu/cuda_impl/sponge/nvtit/md_iteration_setup_random_state_gpu_impl.cuh"
#include "backend/kernel_compiler/gpu/cuda_impl/util.cuh"
#include "backend/kernel_compiler/gpu/cuda_impl/sponge/common_sponge.cuh"
void MD_Iteration_Setup_Random_State(int float4_numbers, curandStatePhilox4_32_10_t *rand_state, int seed,
cudaStream_t stream) {
Setup_Rand_Normal_Kernel<<<ceilf(static_cast<float>(float4_numbers) / 32.), 32, 0, stream>>>(float4_numbers,
rand_state, seed);
}
void MD_Iteration_Setup_Random_State(int float4_numbers, curandStatePhilox4_32_10_t *rand_state, int seed,
cudaStream_t stream);

@ -0,0 +1,23 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_MD_ITERATION_SETUP_RANDOM_STATE_GPU_IMPL_H_
#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_MD_ITERATION_SETUP_RANDOM_STATE_GPU_IMPL_H_
#include <curand_kernel.h>
#include "runtime/device/gpu/cuda_common.h"
void MD_Iteration_Setup_Random_State(int float4_numbers, curandStatePhilox4_32_10_t *rand_state, int seed,
cudaStream_t stream);
#endif

@ -93,12 +93,13 @@ __global__ void PME_Excluded_Energy_Correction(const int atom_numbers, const UNS
}
}
void PMEEnergy(int fftx, int ffty, int fftz, int atom_numbers, float beta, float *PME_BC, int *pme_uxyz,
float *pme_frxyz, float *PME_Q, float *pme_fq, int *PME_atom_near, int *pme_kxyz, const int *uint_crd_f,
const float *charge, int *nl_atom_numbers, int *nl_atom_serial, int *nl, const float *scaler_f,
const int *excluded_list_start, const int *excluded_list, const int *excluded_atom_numbers,
float *d_reciprocal_ene, float *d_self_ene, float *d_direct_ene, float *d_correction_ene,
dim3 thread_PME, int PME_Nin, int PME_Nfft, int PME_Nall, const cufftHandle &PME_plan_r2c,
const cufftHandle &PME_plan_c2r, cudaStream_t stream) {
UNSIGNED_INT_VECTOR *uint_crd =
const_cast<UNSIGNED_INT_VECTOR *>(reinterpret_cast<const UNSIGNED_INT_VECTOR *>(uint_crd_f));
VECTOR *scaler = const_cast<VECTOR *>(reinterpret_cast<const VECTOR *>(scaler_f));
@ -106,97 +107,11 @@ void PMEEnergy(int fftx, int ffty, int fftz, int atom_numbers, float beta, float
NEIGHBOR_LIST *nl_a = reinterpret_cast<NEIGHBOR_LIST *>(nl);
construct_neighbor_list_kernel<<<ceilf(static_cast<float>(atom_numbers) / 128), 128, 0, stream>>>(
atom_numbers, max_neighbor_numbers, nl_atom_numbers, nl_atom_serial, nl_a);
std::vector<float> h_box_length(3);
cudaMemcpyAsync(h_box_length.data(), box_length_f, sizeof(float) * h_box_length.size(), cudaMemcpyDeviceToHost,
stream);
cudaStreamSynchronize(stream);
VECTOR *box_length = reinterpret_cast<VECTOR *>(h_box_length.data());
UNSIGNED_INT_VECTOR *PME_uxyz = reinterpret_cast<UNSIGNED_INT_VECTOR *>(pme_uxyz);
UNSIGNED_INT_VECTOR *PME_kxyz = reinterpret_cast<UNSIGNED_INT_VECTOR *>(pme_kxyz);
VECTOR *PME_frxyz = reinterpret_cast<VECTOR *>(pme_frxyz);
cufftComplex *PME_FQ = reinterpret_cast<cufftComplex *>(pme_fq);
cufftHandle PME_plan_r2c;
cufftHandle PME_plan_c2r;
cufftPlan3d(&PME_plan_r2c, fftx, ffty, fftz, CUFFT_R2C);
cufftPlan3d(&PME_plan_c2r, fftx, ffty, fftz, CUFFT_C2R);
cufftSetStream(PME_plan_r2c, stream);
cufftSetStream(PME_plan_c2r, stream);
thread_PME.x = 8;
thread_PME.y = 8;
int PME_Nin = ffty * fftz;
int PME_Nfft = fftx * ffty * (fftz / 2 + 1);
int PME_Nall = fftx * ffty * fftz;
float volume = box_length[0].x * box_length[0].y * box_length[0].z;
UNSIGNED_INT_VECTOR *PME_kxyz_cpu;
Malloc_Safely(reinterpret_cast<void **>(&PME_kxyz_cpu), sizeof(UNSIGNED_INT_VECTOR) * 64);
int kx, ky, kz, kxrp, kyrp, kzrp, index;
for (kx = 0; kx < 4; kx++) {
for (ky = 0; ky < 4; ky++) {
for (kz = 0; kz < 4; kz++) {
index = kx * 16 + ky * 4 + kz;
PME_kxyz_cpu[index].uint_x = kx;
PME_kxyz_cpu[index].uint_y = ky;
PME_kxyz_cpu[index].uint_z = kz;
}
}
}
cudaMemcpyAsync(PME_kxyz, PME_kxyz_cpu, sizeof(UNSIGNED_INT_VECTOR) * 64, cudaMemcpyHostToDevice, stream);
cudaStreamSynchronize(stream);
free(PME_kxyz_cpu);
// initial start
float *B1, *B2, *B3, *PME_BC0;
B1 = reinterpret_cast<float *>(malloc(sizeof(float) * fftx));
B2 = reinterpret_cast<float *>(malloc(sizeof(float) * ffty));
B3 = reinterpret_cast<float *>(malloc(sizeof(float) * fftz));
PME_BC0 = reinterpret_cast<float *>(malloc(sizeof(float) * PME_Nfft));
for (kx = 0; kx < fftx; kx++) {
B1[kx] = getb(kx, fftx, 4);
}
for (ky = 0; ky < ffty; ky++) {
B2[ky] = getb(ky, ffty, 4);
}
for (kz = 0; kz < fftz; kz++) {
B3[kz] = getb(kz, fftz, 4);
}
float mprefactor = PI * PI / -beta / beta;
float msq;
for (kx = 0; kx < fftx; kx++) {
kxrp = kx;
if (kx > fftx / 2) kxrp = fftx - kx;
for (ky = 0; ky < ffty; ky++) {
kyrp = ky;
if (ky > ffty / 2) kyrp = ffty - ky;
for (kz = 0; kz <= fftz / 2; kz++) {
kzrp = kz;
msq = kxrp * kxrp / box_length[0].x / box_length[0].x + kyrp * kyrp / box_length[0].y / box_length[0].y +
kzrp * kzrp / box_length[0].z / box_length[0].z;
index = kx * ffty * (fftz / 2 + 1) + ky * (fftz / 2 + 1) + kz;
if ((kx + ky + kz) == 0) {
PME_BC0[index] = 0;
} else {
PME_BC0[index] = 1.0 / PI / msq * exp(mprefactor * msq) / volume;
}
PME_BC0[index] *= B1[kx] * B2[ky] * B3[kz];
}
}
}
cudaMemcpyAsync(PME_BC, PME_BC0, sizeof(float) * PME_Nfft, cudaMemcpyHostToDevice, stream);
cudaStreamSynchronize(stream);
free(B1);
free(B2);
free(B3);
free(PME_BC0);
Reset_List<<<3 * atom_numbers / 32 + 1, 32, 0, stream>>>(3 * atom_numbers, reinterpret_cast<int *>(PME_uxyz),
1 << 30);
@ -226,9 +141,3 @@ void PMEEnergy(int fftx, int ffty, int fftz, int atom_numbers, float beta, float
d_correction_ene);
return;
}
void PMEEnergy(int fftx, int ffty, int fftz, int atom_numbers, float beta, float *box_length_f, float *PME_BC,
int *pme_uxyz, float *pme_frxyz, float *PME_Q, float *pme_fq, int *PME_atom_near, int *pme_kxyz,
const int *uint_crd_f, const float *charge, int *nl_atom_numbers, int *nl_atom_serial, int *nl,
const float *scaler_f, const int *excluded_list_start, const int *excluded_list,
const int *excluded_atom_numbers, float *d_reciprocal_ene, float *d_self_ene, float *d_direct_ene,
float *d_correction_ene, cudaStream_t stream);

@ -16,15 +16,15 @@
#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_PME_PME_ENERGY_IMPL_H_
#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_PME_PME_ENERGY_IMPL_H_
#include <cufft.h>
#include <vector>
#include "runtime/device/gpu/cuda_common.h"
void PMEEnergy(int fftx, int ffty, int fftz, int atom_numbers, float beta, float *PME_BC, int *pme_uxyz,
float *pme_frxyz, float *PME_Q, float *pme_fq, int *PME_atom_near, int *pme_kxyz, const int *uint_crd_f,
const float *charge, int *nl_atom_numbers, int *nl_atom_serial, int *nl, const float *scaler_f,
const int *excluded_list_start, const int *excluded_list, const int *excluded_atom_numbers,
float *d_reciprocal_ene, float *d_self_ene, float *d_direct_ene, float *d_correction_ene,
dim3 thread_PME, int PME_Nin, int PME_Nfft, int PME_Nall, const cufftHandle &PME_plan_r2c,
const cufftHandle &PME_plan_c2r, cudaStream_t stream);
#endif

@ -28,7 +28,7 @@ __global__ void PME_BCFQ(cufftComplex *PME_FQ, float *PME_BC, int PME_Nfft) {
__global__ void PME_Final(int *PME_atom_near, const float *charge, const float *PME_Q, VECTOR *force,
const VECTOR *PME_frxyz, const UNSIGNED_INT_VECTOR *PME_kxyz,
const _VECTOR PME_inverse_box_vector, const int atom_numbers) {
int atom = blockDim.x * blockIdx.x + threadIdx.x;
if (atom < atom_numbers) {
int k, kx;
@ -73,8 +73,9 @@ __global__ void PME_Final(int *PME_atom_near, const float *charge, const float *
void PMEReciprocalForce(int fftx, int ffty, int fftz, int atom_numbers, float beta, float *PME_BC, int *pme_uxyz,
float *pme_frxyz, float *PME_Q, float *pme_fq, int *PME_atom_near, int *pme_kxyz,
const int *uint_crd_f, const float *charge, float *force, int PME_Nin, int PME_Nall,
int PME_Nfft, const cufftHandle &PME_plan_r2c, const cufftHandle &PME_plan_c2r,
const _VECTOR &PME_inverse_box_vector, cudaStream_t stream) {
Reset_List<<<ceilf(static_cast<float>(3. * atom_numbers) / 128), 128, 0, stream>>>(3 * atom_numbers, force, 0.);
UNSIGNED_INT_VECTOR *uint_crd =
const_cast<UNSIGNED_INT_VECTOR *>(reinterpret_cast<const UNSIGNED_INT_VECTOR *>(uint_crd_f));
@ -86,98 +87,8 @@ void PMEReciprocalForce(int fftx, int ffty, int fftz, int atom_numbers, float be
VECTOR *PME_frxyz = reinterpret_cast<VECTOR *>(pme_frxyz);
VECTOR *frc = reinterpret_cast<VECTOR *>(force);
std::vector<float> h_box_length(3);
cudaMemcpyAsync(h_box_length.data(), box_length_f, sizeof(float) * h_box_length.size(), cudaMemcpyDeviceToHost,
stream);
cudaStreamSynchronize(stream);
VECTOR *box_length = const_cast<VECTOR *>(reinterpret_cast<const VECTOR *>(h_box_length.data()));
cufftComplex *PME_FQ = reinterpret_cast<cufftComplex *>(pme_fq);
VECTOR PME_inverse_box_vector;
PME_inverse_box_vector.x = static_cast<float>(fftx) / box_length[0].x;
PME_inverse_box_vector.y = static_cast<float>(ffty) / box_length[0].y;
PME_inverse_box_vector.z = static_cast<float>(fftz) / box_length[0].z;
cufftHandle PME_plan_r2c;
cufftHandle PME_plan_c2r;
cufftPlan3d(&PME_plan_r2c, fftx, ffty, fftz, CUFFT_R2C);
cufftPlan3d(&PME_plan_c2r, fftx, ffty, fftz, CUFFT_C2R);
cufftSetStream(PME_plan_r2c, stream);
cufftSetStream(PME_plan_c2r, stream);
thread_PME.x = 8;
thread_PME.y = 8;
int PME_Nin = ffty * fftz;
int PME_Nfft = fftx * ffty * (fftz / 2 + 1);
int PME_Nall = fftx * ffty * fftz;
float volume = box_length[0].x * box_length[0].y * box_length[0].z;
UNSIGNED_INT_VECTOR *PME_kxyz_cpu;
Malloc_Safely(reinterpret_cast<void **>(&PME_kxyz_cpu), sizeof(UNSIGNED_INT_VECTOR) * 64);
int kx, ky, kz, kxrp, kyrp, kzrp, index;
for (kx = 0; kx < 4; kx++) {
for (ky = 0; ky < 4; ky++) {
for (kz = 0; kz < 4; kz++) {
index = kx * 16 + ky * 4 + kz;
PME_kxyz_cpu[index].uint_x = kx;
PME_kxyz_cpu[index].uint_y = ky;
PME_kxyz_cpu[index].uint_z = kz;
}
}
}
cudaMemcpyAsync(PME_kxyz, PME_kxyz_cpu, sizeof(UNSIGNED_INT_VECTOR) * 64, cudaMemcpyHostToDevice, stream);
cudaStreamSynchronize(stream);
free(PME_kxyz_cpu);
// initial start
float *B1, *B2, *B3, *PME_BC0;
B1 = reinterpret_cast<float *>(malloc(sizeof(float) * fftx));
B2 = reinterpret_cast<float *>(malloc(sizeof(float) * ffty));
B3 = reinterpret_cast<float *>(malloc(sizeof(float) * fftz));
PME_BC0 = reinterpret_cast<float *>(malloc(sizeof(float) * PME_Nfft));
for (kx = 0; kx < fftx; kx++) {
B1[kx] = getb(kx, fftx, 4);
}
for (ky = 0; ky < ffty; ky++) {
B2[ky] = getb(ky, ffty, 4);
}
for (kz = 0; kz < fftz; kz++) {
B3[kz] = getb(kz, fftz, 4);
}
float mprefactor = PI * PI / -beta / beta;
float msq;
for (kx = 0; kx < fftx; kx++) {
kxrp = kx;
if (kx > fftx / 2) kxrp = fftx - kx;
for (ky = 0; ky < ffty; ky++) {
kyrp = ky;
if (ky > ffty / 2) kyrp = ffty - ky;
for (kz = 0; kz <= fftz / 2; kz++) {
kzrp = kz;
msq = kxrp * kxrp / box_length[0].x / box_length[0].x + kyrp * kyrp / box_length[0].y / box_length[0].y +
kzrp * kzrp / box_length[0].z / box_length[0].z;
index = kx * ffty * (fftz / 2 + 1) + ky * (fftz / 2 + 1) + kz;
if ((kx + ky + kz) == 0) {
PME_BC0[index] = 0;
} else {
PME_BC0[index] = 1.0 / PI / msq * exp(mprefactor * msq) / volume;
}
PME_BC0[index] *= B1[kx] * B2[ky] * B3[kz];
}
}
}
cudaMemcpyAsync(PME_BC, PME_BC0, sizeof(float) * PME_Nfft, cudaMemcpyHostToDevice, stream);
cudaStreamSynchronize(stream);
free(B1);
free(B2);
free(B3);
free(PME_BC0);
// initial end
Reset_List<<<ceilf(static_cast<float>(3. * atom_numbers) / 128), 128, 0, stream>>>(
3 * atom_numbers, reinterpret_cast<float *>(frc), 0.);
@ -198,8 +109,3 @@ void PMEReciprocalForce(int fftx, int ffty, int fftz, int atom_numbers, float be
PME_kxyz, PME_inverse_box_vector, atom_numbers);
return;
}
void PMEReciprocalForce(int fftx, int ffty, int fftz, int atom_numbers, float beta, float *PME_BC, int *pme_uxyz,
float *pme_frxyz, float *PME_Q, float *pme_fq, int *PME_atom_near, int *pme_kxyz,
const float *box_length_f, const int *uint_crd_f, const float *charge, float *force,
cudaStream_t stream);

@ -16,13 +16,18 @@
#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_PME_PME_RECIPROCAL_FORCE_IMPL_H_
#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_PME_PME_RECIPROCAL_FORCE_IMPL_H_
#include <cufft.h>
#include <vector>
#include "runtime/device/gpu/cuda_common.h"
struct _VECTOR {
float x;
float y;
float z;
};
void PMEReciprocalForce(int fftx, int ffty, int fftz, int atom_numbers, float beta, float *PME_BC, int *pme_uxyz,
float *pme_frxyz, float *PME_Q, float *pme_fq, int *PME_atom_near, int *pme_kxyz,
const int *uint_crd_f, const float *charge, float *force, int PME_Nin, int PME_Nall,
int PME_Nfft, const cufftHandle &PME_plan_r2c, const cufftHandle &PME_plan_c2r,
const _VECTOR &PME_inverse_box_vector, cudaStream_t stream);
#endif

@ -0,0 +1,27 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/gpu/sponge/common/crd_to_uint_crd_kernel.h"
namespace mindspore {
namespace kernel {
MS_REG_GPU_KERNEL_TWO(
CrdToUintCrd,
KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeUInt32),
CrdToUintCrdGpuKernel, float, unsigned int)
} // namespace kernel
} // namespace mindspore

@ -0,0 +1,87 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONG_COMMON_CRD_TO_UINT_CRD_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONG_COMMON_CRD_TO_UINT_CRD_KERNEL_H_
#include <cuda_runtime_api.h>
#include <vector>
#include <string>
#include <map>
#include "backend/kernel_compiler/gpu/gpu_kernel.h"
#include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
#include "runtime/device/gpu/cuda_common.h"
#include "backend/kernel_compiler/gpu/cuda_impl/sponge/common/crd_to_uint_crd_impl.cuh"
namespace mindspore {
namespace kernel {
template <typename T, typename T1>
class CrdToUintCrdGpuKernel : public GpuKernel {
public:
CrdToUintCrdGpuKernel() : ele_crd(1) {}
~CrdToUintCrdGpuKernel() override = default;
bool Init(const CNodePtr &kernel_node) override {
kernel_node_ = kernel_node;
atom_numbers = static_cast<int>(GetAttr<int64_t>(kernel_node, "atom_numbers"));
auto shape_crd_to_uint_crd_cof = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
auto shape_crd = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
for (size_t i = 0; i < shape_crd_to_uint_crd_cof.size(); i++)
ele_crd_to_uint_crd_cof *= shape_crd_to_uint_crd_cof[i];
for (size_t i = 0; i < shape_crd.size(); i++) ele_crd *= shape_crd[i];
InitSizeLists();
return true;
}
const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
auto crd_to_uint_crd_cof = GetDeviceAddress<const T>(inputs, 0);
auto crd = GetDeviceAddress<const T>(inputs, 1);
auto uint_crd = GetDeviceAddress<T1>(outputs, 0);
CrdToUintCrd(atom_numbers, crd_to_uint_crd_cof, crd, uint_crd, reinterpret_cast<cudaStream_t>(stream_ptr));
return true;
}
protected:
void InitSizeLists() override {
input_size_list_.push_back(ele_crd_to_uint_crd_cof * sizeof(T));
input_size_list_.push_back(ele_crd * sizeof(T));
output_size_list_.push_back(3 * atom_numbers * sizeof(T));
}
private:
size_t ele_crd_to_uint_crd_cof = 1;
size_t ele_crd = 1;
std::vector<size_t> input_size_list_;
std::vector<size_t> output_size_list_;
std::vector<size_t> workspace_size_list_;
int atom_numbers;
};
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONG_COMMON_CRD_TO_UINT_CRD_KERNEL_H_

@ -38,6 +38,7 @@ MS_REG_GPU_KERNEL_TWO(NeighborListUpdate,
.AddInputAttr(kNumberTypeInt32)
.AddInputAttr(kNumberTypeInt32)
.AddInputAttr(kNumberTypeInt32)
.AddInputAttr(kNumberTypeInt32)
.AddOutputAttr(kNumberTypeFloat32),
NeighborListUpdateGpuKernel, int, float)

@ -36,7 +36,6 @@ class NeighborListUpdateGpuKernel : public GpuKernel {
bool Init(const CNodePtr &kernel_node) override {
grid_numbers = static_cast<int>(GetAttr<int64_t>(kernel_node, "grid_numbers"));
atom_numbers = static_cast<int>(GetAttr<int64_t>(kernel_node, "atom_numbers"));
refresh_count = static_cast<int>(GetAttr<int64_t>(kernel_node, "refresh_count"));
refresh_interval = static_cast<int>(GetAttr<int64_t>(kernel_node, "refresh_interval"));
not_first_time = static_cast<int>(GetAttr<int64_t>(kernel_node, "not_first_time"));
Nxy = static_cast<int>(GetAttr<int64_t>(kernel_node, "Nxy"));
@ -47,7 +46,8 @@ class NeighborListUpdateGpuKernel : public GpuKernel {
cutoff_with_skin = static_cast<float>(GetAttr<float>(kernel_node, "cutoff_with_skin"));
half_cutoff_with_skin = static_cast<float>(GetAttr<float>(kernel_node, "half_cutoff_with_skin"));
cutoff_with_skin_square = static_cast<float>(GetAttr<float>(kernel_node, "cutoff_with_skin_square"));
h_bucket.resize(grid_numbers);
h_gpointer.resize(grid_numbers);
InitSizeLists();
return true;
}
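// h_bucket and h_gpointer are member vectors sized once in Init above, so Launch() below reuses them
// instead of constructing new host-side vectors on every call (their former per-launch definitions are
// left commented out further down).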
@ -76,17 +76,18 @@ class NeighborListUpdateGpuKernel : public GpuKernel {
auto excluded_list = GetDeviceAddress<int>(inputs, 15);
auto excluded_numbers = GetDeviceAddress<int>(inputs, 16);
auto need_refresh_flag = GetDeviceAddress<int>(inputs, 17);
auto d_refresh_count = GetDeviceAddress<int>(inputs, 18);
GRID_BUCKET *d_bucket = reinterpret_cast<GRID_BUCKET *>(GetDeviceAddress<int>(workspaces, 0));
GRID_POINTER *d_gpointer = reinterpret_cast<GRID_POINTER *>(GetDeviceAddress<int>(workspaces, 1));
NEIGHBOR_LIST *nl = GetDeviceAddress<NEIGHBOR_LIST>(workspaces, 2);
float *half_crd_to_uint_crd_cof = GetDeviceAddress<float>(workspaces, 3);
// std::vector<GRID_BUCKET> h_bucket(grid_numbers);
for (size_t i = 0; i < h_bucket.size(); i += 1) {
h_bucket[i].atom_serial = bucket + i * max_atom_in_grid_numbers;
}
// std::vector<GRID_POINTER> h_gpointer(grid_numbers);
for (size_t i = 0; i < h_gpointer.size(); i += 1) {
h_gpointer[i].grid_serial = gpointer + i * 125;
}
@ -98,7 +99,7 @@ class NeighborListUpdateGpuKernel : public GpuKernel {
Construct_Neighbor_List(atom_numbers, max_neighbor_numbers, nl_atom_numbers, nl_atom_serial, nl,
reinterpret_cast<cudaStream_t>(stream_ptr));
Neighbor_List_Update(grid_numbers, atom_numbers, d_refresh_count, refresh_interval, not_first_time, skin, Nxy,
cutoff_square, cutoff_with_skin_square, grid_N, box_length, atom_numbers_in_grid_bucket,
grid_length_inverse, atom_in_grid_serial, d_bucket, crd, old_crd, crd_to_uint_crd_cof,
half_crd_to_uint_crd_cof, uint_crd, uint_dr_to_dr_cof, d_gpointer, nl, excluded_list_start,
@ -132,6 +133,7 @@ class NeighborListUpdateGpuKernel : public GpuKernel {
input_size_list_.push_back(sizeof(int) * excluded_atom_numbers);
input_size_list_.push_back(sizeof(int) * atom_numbers);
input_size_list_.push_back(sizeof(int));
input_size_list_.push_back(sizeof(int));
workspace_size_list_.push_back(sizeof(GRID_BUCKET) * grid_numbers);
@ -148,7 +150,6 @@ class NeighborListUpdateGpuKernel : public GpuKernel {
int not_first_time;
int atom_numbers;
int grid_numbers;
int refresh_count;
int refresh_interval;
int Nxy;
int max_atom_in_grid_numbers;
@ -163,6 +164,8 @@ class NeighborListUpdateGpuKernel : public GpuKernel {
std::vector<size_t> input_size_list_;
std::vector<size_t> output_size_list_;
std::vector<size_t> workspace_size_list_;
std::vector<GRID_BUCKET> h_bucket;
std::vector<GRID_POINTER> h_gpointer;
};
} // namespace kernel
} // namespace mindspore

@ -45,14 +45,6 @@ class MDIterationLeapFrogGpuKernel : public GpuKernel {
is_max_velocity = static_cast<int>(GetAttr<int64_t>(kernel_node, "is_max_velocity"));
max_velocity = static_cast<float>(GetAttr<float>(kernel_node, "max_velocity"));
// printf("float4_numbers: %d", float4_numbers);
// printf("atom_numbers: %d", atom_numbers);
// printf("half_dt: %f", half_dt);
// printf("dt: %f", dt);
// printf("exp_gamma: %f", exp_gamma);
// printf("is_max_velocity: %d", is_max_velocity);
// printf("max_velocity: %f", max_velocity);
auto shape_mass_inverse = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
auto shape_qrt_mass = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);

@ -0,0 +1,35 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/gpu/sponge/nvtit/md_iteration_leap_frog_liujian_gpu_kernel.h"
namespace mindspore {
namespace kernel {
MS_REG_GPU_KERNEL_TWO(MDIterationLeapFrogLiujian,
KernelAttr()
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat32),
MDIterationLeapFrogLiujianCudaGpuKernel, float, int)
} // namespace kernel
} // namespace mindspore

@ -0,0 +1,100 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_MD_ITERATION_LEAP_FROG_LIUJIAN_GPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_MD_ITERATION_LEAP_FROG_LIUJIAN_GPU_KERNEL_H_
#include "backend/kernel_compiler/gpu/cuda_impl/sponge/nvtit/md_iteration_leap_frog_liujian_gpu_impl.cuh"
#include <cuda_runtime_api.h>
#include <map>
#include <string>
#include <vector>
#include "backend/kernel_compiler/gpu/gpu_kernel.h"
#include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
#include "runtime/device/gpu/cuda_common.h"
namespace mindspore {
namespace kernel {
template <typename T, typename T1>
class MDIterationLeapFrogLiujianCudaGpuKernel : public GpuKernel {
public:
MDIterationLeapFrogLiujianCudaGpuKernel() {}
~MDIterationLeapFrogLiujianCudaGpuKernel() override = default;
bool Init(const CNodePtr &kernel_node) override {
// get bond_numbers
kernel_node_ = kernel_node;
atom_numbers = static_cast<int>(GetAttr<int64_t>(kernel_node, "atom_numbers"));
half_dt = static_cast<float>(GetAttr<float>(kernel_node, "half_dt"));
dt = static_cast<float>(GetAttr<float>(kernel_node, "dt"));
exp_gamma = static_cast<float>(GetAttr<float>(kernel_node, "exp_gamma"));
float4_numbers = ceil(3. * static_cast<double>(atom_numbers) / 4.);
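// Rand_Normal generates normals four at a time (one float4 per Philox state), so the 3 * atom_numbers
// random force components are covered by ceil(3 * atom_numbers / 4) states.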
InitSizeLists();
return true;
}
const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
auto inverse_mass = GetDeviceAddress<float>(inputs, 0);
auto sqrt_mass_inverse = GetDeviceAddress<float>(inputs, 1);
auto vel = GetDeviceAddress<float>(inputs, 2);
auto crd = GetDeviceAddress<float>(inputs, 3);
auto frc = GetDeviceAddress<float>(inputs, 4);
auto acc = GetDeviceAddress<float>(inputs, 5);
auto rand_state = GetDeviceAddress<float>(inputs, 6);
auto rand_frc = GetDeviceAddress<float>(inputs, 7);
auto output = GetDeviceAddress<float>(outputs, 0);
MD_Iteration_Leap_Frog_With_LiuJian(atom_numbers, half_dt, dt, exp_gamma, float4_numbers, inverse_mass,
sqrt_mass_inverse, vel, crd, frc, acc,
reinterpret_cast<curandStatePhilox4_32_10_t *>(rand_state), rand_frc, output,
reinterpret_cast<cudaStream_t>(stream_ptr));
return true;
}
protected:
void InitSizeLists() override {
input_size_list_.push_back(atom_numbers * sizeof(float));
input_size_list_.push_back(atom_numbers * sizeof(float));
input_size_list_.push_back(atom_numbers * 3 * sizeof(float));
input_size_list_.push_back(atom_numbers * 3 * sizeof(float));
input_size_list_.push_back(atom_numbers * 3 * sizeof(float));
input_size_list_.push_back(atom_numbers * 3 * sizeof(float));
input_size_list_.push_back(float4_numbers * sizeof(curandStatePhilox4_32_10_t));
input_size_list_.push_back(atom_numbers * 3 * sizeof(float));
output_size_list_.push_back(atom_numbers * 3 * sizeof(T));
}
private:
std::vector<size_t> input_size_list_;
std::vector<size_t> output_size_list_;
std::vector<size_t> workspace_size_list_;
int atom_numbers;
float half_dt;
float dt;
float exp_gamma;
int float4_numbers;
};
} // namespace kernel
} // namespace mindspore
#endif

@ -0,0 +1,25 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/gpu/sponge/nvtit/md_iteration_setup_random_state.h"
namespace mindspore {
namespace kernel {
MS_REG_GPU_KERNEL_TWO(MDIterationSetupRandState, KernelAttr().AddOutputAttr(kNumberTypeFloat32),
MDIterationSetupRandStateGpuKernel, float, int)
} // namespace kernel
} // namespace mindspore

@ -0,0 +1,74 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_MD_ITERATION_SETUP_RANDOM_STATE_GPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_MD_ITERATION_SETUP_RANDOM_STATE_GPU_KERNEL_H_
#include "backend/kernel_compiler/gpu/cuda_impl/sponge/nvtit/md_iteration_setup_random_state_gpu_impl.cuh"
#include <cuda_runtime_api.h>
#include <map>
#include <string>
#include <vector>
#include "backend/kernel_compiler/gpu/gpu_kernel.h"
#include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
#include "runtime/device/gpu/cuda_common.h"
namespace mindspore {
namespace kernel {
template <typename T, typename T1>
class MDIterationSetupRandStateGpuKernel : public GpuKernel {
public:
MDIterationSetupRandStateGpuKernel() {}
~MDIterationSetupRandStateGpuKernel() override = default;
bool Init(const CNodePtr &kernel_node) override {
// get bond_numbers
kernel_node_ = kernel_node;
atom_numbers = static_cast<int>(GetAttr<int64_t>(kernel_node, "atom_numbers"));
seed = static_cast<int>(GetAttr<int64_t>(kernel_node, "seed"));
float4_numbers = ceil(3. * static_cast<double>(atom_numbers) / 4.);
InitSizeLists();
return true;
}
const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
auto output = GetDeviceAddress<float>(outputs, 0);
curandStatePhilox4_32_10_t *rand_state = reinterpret_cast<curandStatePhilox4_32_10_t *>(output);
MD_Iteration_Setup_Random_State(float4_numbers, rand_state, seed, reinterpret_cast<cudaStream_t>(stream_ptr));
return true;
}
protected:
void InitSizeLists() override { output_size_list_.push_back(sizeof(curandStatePhilox4_32_10_t) * float4_numbers); }
private:
std::vector<size_t> input_size_list_;
std::vector<size_t> output_size_list_;
std::vector<size_t> workspace_size_list_;
int atom_numbers;
int seed;
int float4_numbers;
};
} // namespace kernel
} // namespace mindspore
#endif

@ -19,7 +19,6 @@ namespace mindspore {
namespace kernel {
MS_REG_GPU_KERNEL_TWO(PMEEnergy,
KernelAttr()
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeUInt32) .AddInputAttr(kNumberTypeUInt32)
.AddInputAttr(kNumberTypeFloat32) .AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeInt32) .AddInputAttr(kNumberTypeInt32)

@ -18,8 +18,6 @@
#include <cuda_runtime_api.h>
#include <cufft.h>
#include <vector>
#include <string>
#include <map>
#include "backend/kernel_compiler/gpu/gpu_kernel.h" #include "backend/kernel_compiler/gpu/gpu_kernel.h"
#include "backend/kernel_compiler/gpu/gpu_kernel_factory.h" #include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
#include "runtime/device/gpu/cuda_common.h" #include "runtime/device/gpu/cuda_common.h"
@ -40,8 +38,76 @@ class PMEEnergyGpuKernel : public GpuKernel {
fftx = static_cast<int>(GetAttr<int64_t>(kernel_node, "fftx"));
ffty = static_cast<int>(GetAttr<int64_t>(kernel_node, "ffty"));
fftz = static_cast<int>(GetAttr<int64_t>(kernel_node, "fftz"));
PME_Nall = fftx * ffty * fftz;
float box_length_0 = static_cast<float>(GetAttr<float_t>(kernel_node, "box_length_0"));
float box_length_1 = static_cast<float>(GetAttr<float_t>(kernel_node, "box_length_1"));
float box_length_2 = static_cast<float>(GetAttr<float_t>(kernel_node, "box_length_2"));
std::vector<float> h_box_length(3);
h_box_length[0] = box_length_0;
h_box_length[1] = box_length_1;
h_box_length[2] = box_length_2;
VECTOR *box_length = reinterpret_cast<VECTOR *>(h_box_length.data());
cufftPlan3d(&PME_plan_r2c, fftx, ffty, fftz, CUFFT_R2C);
cufftPlan3d(&PME_plan_c2r, fftx, ffty, fftz, CUFFT_C2R);
_thread_PME.x = 8;
_thread_PME.y = 8;
PME_Nin = ffty * fftz;
PME_Nfft = fftx * ffty * (fftz / 2 + 1);
PME_Nall = fftx * ffty * fftz;
PME_kxyz_cpu.resize(64);
volume = box_length[0].x * box_length[0].y * box_length[0].z;
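// The loops below precompute the smooth-PME host tables once at Init time:
//   PME_kxyz_cpu enumerates the 4 x 4 x 4 B-spline stencil offsets,
//   B1 / B2 / B3 hold the squared B-spline moduli |b(k)|^2 per axis (computed by getb further down),
//   PME_BC0[m] = exp(-pi^2 |m|^2 / beta^2) / (pi |m|^2 volume) * B1 * B2 * B3, with the m = 0 term zeroed.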
int kx, ky, kz, kxrp, kyrp, kzrp, index;
for (kx = 0; kx < 4; kx++) {
for (ky = 0; ky < 4; ky++) {
for (kz = 0; kz < 4; kz++) {
index = kx * 16 + ky * 4 + kz;
PME_kxyz_cpu[index].uint_x = kx;
PME_kxyz_cpu[index].uint_y = ky;
PME_kxyz_cpu[index].uint_z = kz;
}
}
}
B1.resize(fftx);
B2.resize(ffty);
B3.resize(fftz);
PME_BC0.resize(PME_Nfft);
for (kx = 0; kx < fftx; kx++) {
B1[kx] = getb(kx, fftx, 4);
}
for (ky = 0; ky < ffty; ky++) {
B2[ky] = getb(ky, ffty, 4);
}
for (kz = 0; kz < fftz; kz++) {
B3[kz] = getb(kz, fftz, 4);
}
float mprefactor = PI * PI / -beta / beta;
float msq;
for (kx = 0; kx < fftx; kx++) {
kxrp = kx;
if (kx > fftx / 2) kxrp = fftx - kx;
for (ky = 0; ky < ffty; ky++) {
kyrp = ky;
if (ky > ffty / 2) kyrp = ffty - ky;
for (kz = 0; kz <= fftz / 2; kz++) {
kzrp = kz;
msq = kxrp * kxrp / box_length[0].x / box_length[0].x + kyrp * kyrp / box_length[0].y / box_length[0].y +
kzrp * kzrp / box_length[0].z / box_length[0].z;
index = kx * ffty * (fftz / 2 + 1) + ky * (fftz / 2 + 1) + kz;
if ((kx + ky + kz) == 0) {
PME_BC0[index] = 0;
} else {
PME_BC0[index] = 1.0 / PI / msq * exp(mprefactor * msq) / volume;
}
PME_BC0[index] *= B1[kx] * B2[ky] * B3[kz];
}
}
}
InitSizeLists();
return true;
@ -53,15 +119,14 @@ class PMEEnergyGpuKernel : public GpuKernel {
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
auto uint_crd = GetDeviceAddress<T1>(inputs, 0);
auto charge = GetDeviceAddress<T>(inputs, 1);
auto nl_numbers = GetDeviceAddress<T1>(inputs, 2);
auto nl_serial = GetDeviceAddress<T1>(inputs, 3);
auto scaler = GetDeviceAddress<T>(inputs, 4);
auto excluded_list_start = GetDeviceAddress<int>(inputs, 5);
auto excluded_list = GetDeviceAddress<int>(inputs, 6);
auto excluded_atom_numbers = GetDeviceAddress<int>(inputs, 7);
auto excluded_atom_numbers = GetDeviceAddress<int>(inputs, 8);
auto pme_uxyz = GetDeviceAddress<int>(workspace, 0);     // workspace
auto pme_frxyz = GetDeviceAddress<float>(workspace, 1);  // workspace
@ -77,16 +142,22 @@ class PMEEnergyGpuKernel : public GpuKernel {
auto direct_ene = GetDeviceAddress<T>(outputs, 2); auto direct_ene = GetDeviceAddress<T>(outputs, 2);
auto correction_ene = GetDeviceAddress<T>(outputs, 3); auto correction_ene = GetDeviceAddress<T>(outputs, 3);
-    PMEEnergy(fftx, ffty, fftz, atom_numbers, beta, boxlength, pme_bc, pme_uxyz, pme_frxyz, pme_q, pme_fq,
-              pme_atom_near, pme_kxyz, uint_crd, charge, nl_numbers, nl_serial, nl, scaler, excluded_list_start,
-              excluded_list, excluded_atom_numbers, reciprocal_ene, self_ene, direct_ene, correction_ene,
-              reinterpret_cast<cudaStream_t>(stream_ptr));
+    cufftSetStream(PME_plan_r2c, reinterpret_cast<cudaStream_t>(stream_ptr));
+    cufftSetStream(PME_plan_c2r, reinterpret_cast<cudaStream_t>(stream_ptr));
+    cudaMemcpyAsync(pme_kxyz, PME_kxyz_cpu.data(), sizeof(UNSIGNED_INT_VECTOR) * 64, cudaMemcpyHostToDevice,
+                    reinterpret_cast<cudaStream_t>(stream_ptr));
+    cudaMemcpyAsync(pme_bc, PME_BC0.data(), sizeof(float) * PME_Nfft, cudaMemcpyHostToDevice,
+                    reinterpret_cast<cudaStream_t>(stream_ptr));
+    PMEEnergy(fftx, ffty, fftz, atom_numbers, beta, pme_bc, pme_uxyz, pme_frxyz, pme_q, pme_fq, pme_atom_near, pme_kxyz,
+              uint_crd, charge, nl_numbers, nl_serial, nl, scaler, excluded_list_start, excluded_list,
+              excluded_atom_numbers, reciprocal_ene, self_ene, direct_ene, correction_ene, _thread_PME, PME_Nin,
+              PME_Nfft, PME_Nall, PME_plan_r2c, PME_plan_c2r, reinterpret_cast<cudaStream_t>(stream_ptr));
    return true;
  }
 protected:
  void InitSizeLists() override {
-    input_size_list_.push_back(sizeof(VECTOR));
    input_size_list_.push_back(atom_numbers * sizeof(UNSIGNED_INT_VECTOR));
    input_size_list_.push_back(atom_numbers * sizeof(VECTOR));
    input_size_list_.push_back(atom_numbers * sizeof(T1));
@ -112,12 +183,56 @@ class PMEEnergyGpuKernel : public GpuKernel {
    output_size_list_.push_back(sizeof(T));
  }
cufftComplex expc(cufftComplex z) {
cufftComplex res;
float t = expf(z.x);
sincosf(z.y, &res.y, &res.x);
res.x *= t;
res.y *= t;
return res;
}
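  // Cardinal B-spline M_n(u), evaluated with the standard recursion; n == 2 is the triangle function on [0, 2].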
float M_(float u, int n) {
if (n == 2) {
if (u > 2 || u < 0) return 0;
return 1 - abs(u - 1);
} else {
return u / (n - 1) * M_(u, n - 1) + (n - u) / (n - 1) * M_(u - 1, n - 1);
}
}
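  // Squared modulus |b(k)|^2 of the reciprocal-space B-spline interpolation factor for an order-B_order spline.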
float getb(int k, int NFFT, int B_order) {
cufftComplex tempc, tempc2, res;
float tempf;
tempc2.x = 0;
tempc2.y = 0;
tempc.x = 0;
tempc.y = 2 * (B_order - 1) * PI * k / NFFT;
res = expc(tempc);
for (int kk = 0; kk < (B_order - 1); kk++) {
tempc.x = 0;
tempc.y = 2 * PI * k / NFFT * kk;
tempc = expc(tempc);
tempf = M_(kk + 1, B_order);
tempc2.x += tempf * tempc.x;
tempc2.y += tempf * tempc.y;
}
res = cuCdivf(res, tempc2);
return res.x * res.x + res.y * res.y;
}
 private:
  size_t ele_uint_crd = 1;
  std::vector<size_t> input_size_list_;
  std::vector<size_t> output_size_list_;
  std::vector<size_t> workspace_size_list_;
std::vector<float> B1;
std::vector<float> B2;
std::vector<float> B3;
std::vector<float> PME_BC0;
  int atom_numbers;
  int excluded_numbers;
  int max_nl_numbers = 800;
@ -125,8 +240,16 @@ class PMEEnergyGpuKernel : public GpuKernel {
  int ffty;
  int fftz;
  float beta;
int PME_Nin;
  int PME_Nall;
  int PME_Nfft;
float volume;
float PI = 3.1415926;
cufftHandle PME_plan_r2c;
cufftHandle PME_plan_c2r;
dim3 _thread_PME;
  struct VECTOR {
    float x;
    float y;
@ -138,7 +261,7 @@ class PMEEnergyGpuKernel : public GpuKernel {
    unsigned int uint_y;
    unsigned int uint_z;
  };
std::vector<UNSIGNED_INT_VECTOR> PME_kxyz_cpu;
  struct NEIGHBOR_LIST {
    int atom_numbers;
    int *atom_serial;

@ -17,13 +17,10 @@
namespace mindspore {
namespace kernel {
-MS_REG_GPU_KERNEL_TWO(PMEReciprocalForce,
-                      KernelAttr()
-                        .AddInputAttr(kNumberTypeFloat32)
-                        .AddInputAttr(kNumberTypeUInt32)
-                        .AddInputAttr(kNumberTypeFloat32)
-                        .AddOutputAttr(kNumberTypeFloat32),
-                      PMEReciprocalForceGpuKernel, float, int)
+MS_REG_GPU_KERNEL_TWO(
+  PMEReciprocalForce,
+  KernelAttr().AddInputAttr(kNumberTypeUInt32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
+  PMEReciprocalForceGpuKernel, float, int)
}  // namespace kernel
}  // namespace mindspore

@ -41,6 +41,75 @@ class PMEReciprocalForceGpuKernel : public GpuKernel {
    fftz = static_cast<int>(GetAttr<int64_t>(kernel_node, "fftz"));
    PME_Nall = fftx * ffty * fftz;
    PME_Nfft = fftx * ffty * (fftz / 2 + 1);
PME_Nin = ffty * fftz;
float box_length_0 = static_cast<float>(GetAttr<float_t>(kernel_node, "box_length_0"));
float box_length_1 = static_cast<float>(GetAttr<float_t>(kernel_node, "box_length_1"));
float box_length_2 = static_cast<float>(GetAttr<float_t>(kernel_node, "box_length_2"));
std::vector<float> h_box_length(3);
h_box_length[0] = box_length_0;
h_box_length[1] = box_length_1;
h_box_length[2] = box_length_2;
VECTOR *box_length = reinterpret_cast<VECTOR *>(h_box_length.data());
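    // Grid points per unit length along each axis; used to map atom coordinates onto the PME mesh.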
PME_inverse_box_vector.x = static_cast<float>(fftx) / box_length[0].x;
PME_inverse_box_vector.y = static_cast<float>(ffty) / box_length[0].y;
PME_inverse_box_vector.z = static_cast<float>(fftz) / box_length[0].z;
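    // 3D R2C/C2R cuFFT plans are created once here and bound to the launch stream in Launch().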
cufftPlan3d(&PME_plan_r2c, fftx, ffty, fftz, CUFFT_R2C);
cufftPlan3d(&PME_plan_c2r, fftx, ffty, fftz, CUFFT_C2R);
float volume = box_length[0].x * box_length[0].y * box_length[0].z;
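    // Enumerate the 4 x 4 x 4 grid offsets covered by the order-4 B-spline stencil around each atom.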
PME_kxyz_cpu.resize(64);
int kx, ky, kz, kxrp, kyrp, kzrp, index;
for (kx = 0; kx < 4; kx++) {
for (ky = 0; ky < 4; ky++) {
for (kz = 0; kz < 4; kz++) {
index = kx * 16 + ky * 4 + kz;
PME_kxyz_cpu[index].uint_x = kx;
PME_kxyz_cpu[index].uint_y = ky;
PME_kxyz_cpu[index].uint_z = kz;
}
}
}
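    // Per-axis B-spline moduli |b(k)|^2 and the combined B*C coefficient table (see getb/M_ below).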
B1.resize(fftx);
B2.resize(ffty);
B3.resize(fftz);
PME_BC0.resize(PME_Nfft);
for (kx = 0; kx < fftx; kx++) {
B1[kx] = getb(kx, fftx, 4);
}
for (ky = 0; ky < ffty; ky++) {
B2[ky] = getb(ky, ffty, 4);
}
for (kz = 0; kz < fftz; kz++) {
B3[kz] = getb(kz, fftz, 4);
}
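    // Same B*C construction as in PMEEnergyGpuKernel: the Ewald reciprocal-space prefactor times the
    // spline moduli, with the k = 0 term excluded.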
float mprefactor = PI * PI / -beta / beta;
float msq;
for (kx = 0; kx < fftx; kx++) {
kxrp = kx;
if (kx > fftx / 2) kxrp = fftx - kx;
for (ky = 0; ky < ffty; ky++) {
kyrp = ky;
if (ky > ffty / 2) kyrp = ffty - ky;
for (kz = 0; kz <= fftz / 2; kz++) {
kzrp = kz;
msq = kxrp * kxrp / box_length[0].x / box_length[0].x + kyrp * kyrp / box_length[0].y / box_length[0].y +
kzrp * kzrp / box_length[0].z / box_length[0].z;
index = kx * ffty * (fftz / 2 + 1) + ky * (fftz / 2 + 1) + kz;
if ((kx + ky + kz) == 0) {
PME_BC0[index] = 0;
} else {
PME_BC0[index] = 1.0 / PI / msq * exp(mprefactor * msq) / volume;
}
PME_BC0[index] *= B1[kx] * B2[ky] * B3[kz];
}
}
}
    InitSizeLists();
    return true;
@ -52,9 +121,8 @@ class PMEReciprocalForceGpuKernel : public GpuKernel {
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
-    auto boxlength = GetDeviceAddress<T>(inputs, 0);
-    auto uint_crd = GetDeviceAddress<const T1>(inputs, 1);
-    auto charge = GetDeviceAddress<T>(inputs, 2);
+    auto uint_crd = GetDeviceAddress<const T1>(inputs, 0);
+    auto charge = GetDeviceAddress<T>(inputs, 1);
    auto pme_uxyz = GetDeviceAddress<int>(workspace, 0);     // workspace
    auto pme_frxyz = GetDeviceAddress<float>(workspace, 1);  // workspace
@ -65,9 +133,15 @@ class PMEReciprocalForceGpuKernel : public GpuKernel {
    auto pme_kxyz = GetDeviceAddress<int>(workspace, 6);  // workspace
    auto force = GetDeviceAddress<T>(outputs, 0);
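    // Attach the cached cuFFT plans to this launch's stream and stage the host-side k-offset table
    // and B*C coefficients into device workspace before running the force kernels.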
cufftSetStream(PME_plan_r2c, reinterpret_cast<cudaStream_t>(stream_ptr));
cufftSetStream(PME_plan_c2r, reinterpret_cast<cudaStream_t>(stream_ptr));
cudaMemcpyAsync(pme_kxyz, PME_kxyz_cpu.data(), sizeof(UNSIGNED_INT_VECTOR) * 64, cudaMemcpyHostToDevice,
reinterpret_cast<cudaStream_t>(stream_ptr));
cudaMemcpyAsync(pme_bc, PME_BC0.data(), sizeof(float) * PME_Nfft, cudaMemcpyHostToDevice,
reinterpret_cast<cudaStream_t>(stream_ptr));
    PMEReciprocalForce(fftx, ffty, fftz, atom_numbers, beta, pme_bc, pme_uxyz, pme_frxyz, pme_q, pme_fq, pme_atom_near,
-                       pme_kxyz, boxlength, uint_crd, charge, force, reinterpret_cast<cudaStream_t>(stream_ptr));
+                       pme_kxyz, uint_crd, charge, force, PME_Nin, PME_Nall, PME_Nfft, PME_plan_r2c, PME_plan_c2r,
+                       PME_inverse_box_vector, reinterpret_cast<cudaStream_t>(stream_ptr));
    return true;
  }
@ -88,6 +162,44 @@ class PMEReciprocalForceGpuKernel : public GpuKernel {
    output_size_list_.push_back(atom_numbers * sizeof(VECTOR));
  }
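  // Helper routines shared with PMEEnergyGpuKernel: complex exponential, cardinal B-spline M_n, and |b(k)|^2.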
cufftComplex expc(cufftComplex z) {
cufftComplex res;
float t = expf(z.x);
sincosf(z.y, &res.y, &res.x);
res.x *= t;
res.y *= t;
return res;
}
float M_(float u, int n) {
if (n == 2) {
if (u > 2 || u < 0) return 0;
return 1 - abs(u - 1);
} else {
return u / (n - 1) * M_(u, n - 1) + (n - u) / (n - 1) * M_(u - 1, n - 1);
}
}
float getb(int k, int NFFT, int B_order) {
cufftComplex tempc, tempc2, res;
float tempf;
tempc2.x = 0;
tempc2.y = 0;
tempc.x = 0;
tempc.y = 2 * (B_order - 1) * PI * k / NFFT;
res = expc(tempc);
for (int kk = 0; kk < (B_order - 1); kk++) {
tempc.x = 0;
tempc.y = 2 * PI * k / NFFT * kk;
tempc = expc(tempc);
tempf = M_(kk + 1, B_order);
tempc2.x += tempf * tempc.x;
tempc2.y += tempf * tempc.y;
}
res = cuCdivf(res, tempc2);
return res.x * res.x + res.y * res.y;
}
 private:
  size_t ele_uint_crd = 1;
@ -101,18 +213,27 @@ class PMEReciprocalForceGpuKernel : public GpuKernel {
  float beta;
  int PME_Nall;
  int PME_Nfft;
int PME_Nin;
float PI = 3.1415926;
std::vector<float> B1;
std::vector<float> B2;
std::vector<float> B3;
std::vector<float> PME_BC0;
cufftHandle PME_plan_r2c;
cufftHandle PME_plan_c2r;
  struct VECTOR {
    float x;
    float y;
    float z;
  };
_VECTOR PME_inverse_box_vector;
  struct UNSIGNED_INT_VECTOR {
    unsigned int uint_x;
    unsigned int uint_y;
    unsigned int uint_z;
  };
std::vector<UNSIGNED_INT_VECTOR> PME_kxyz_cpu;
};
}  // namespace kernel
}  // namespace mindspore
