master sponge performance

pull/14539/head
mamba_ni 4 years ago
parent 18d79d35b6
commit 28811fa958

@ -0,0 +1,51 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/gpu/cuda_impl/sponge/common_sponge.cuh"
#include "backend/kernel_compiler/gpu/cuda_impl/sponge/common/crd_to_uint_crd_impl.cuh"
__global__ void Crd_To_Uint_Crd(const int atom_numbers, const VECTOR *scale_factor, const VECTOR *crd,
UNSIGNED_INT_VECTOR *uint_crd) {
int atom_i = blockDim.x * blockIdx.x + threadIdx.x;
if (atom_i < atom_numbers) {
uint_crd[atom_i].uint_x = crd[atom_i].x * scale_factor[0].x;
uint_crd[atom_i].uint_y = crd[atom_i].y * scale_factor[0].y;
uint_crd[atom_i].uint_z = crd[atom_i].z * scale_factor[0].z;
/*uint_crd[atom_i].uint_x = 2 * uint_crd[atom_i].uint_x;
uint_crd[atom_i].uint_y = 2 * uint_crd[atom_i].uint_y;
uint_crd[atom_i].uint_z = 2 * uint_crd[atom_i].uint_z;*/
uint_crd[atom_i].uint_x = uint_crd[atom_i].uint_x << 1;
uint_crd[atom_i].uint_y = uint_crd[atom_i].uint_y << 1;
uint_crd[atom_i].uint_z = uint_crd[atom_i].uint_z << 1;
}
}
void CrdToUintCrd(const int atom_numbers, const float *crd_to_uint_crd_cof_f, const float *crd_f,
unsigned int *uint_crd_f, cudaStream_t stream) {
VECTOR *crd = const_cast<VECTOR *>(reinterpret_cast<const VECTOR *>(crd_f));
VECTOR *crd_to_uint_crd_cof = const_cast<VECTOR *>(reinterpret_cast<const VECTOR *>(crd_to_uint_crd_cof_f));
UNSIGNED_INT_VECTOR *uint_crd =
const_cast<UNSIGNED_INT_VECTOR *>(reinterpret_cast<const UNSIGNED_INT_VECTOR *>(uint_crd_f));
Crd_To_Uint_Crd<<<ceilf(static_cast<float>(atom_numbers) / 128.0), 128, 0, stream>>>(
atom_numbers, crd_to_uint_crd_cof, crd, uint_crd);
return;
}
void CrdToUintCrd(const int atom_numbers, const float *crd_to_uint_crd_cof_f, const float *crd_f,
unsigned int *uint_crd_f, cudaStream_t stream);
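
A minimal host-side usage sketch for the wrapper above (not part of the patch; the atom count, buffer names, and allocation sizes are illustrative only):

// Illustrative driver: three scale factors and 3*n coordinate floats in, 3*n unsigned ints out.
const int n = 128;                      // hypothetical atom count
float *d_cof = nullptr, *d_crd = nullptr;
unsigned int *d_uint_crd = nullptr;
cudaMalloc(&d_cof, 3 * sizeof(float));
cudaMalloc(&d_crd, 3 * n * sizeof(float));
cudaMalloc(&d_uint_crd, 3 * n * sizeof(unsigned int));
cudaStream_t stream;
cudaStreamCreate(&stream);
// ... fill d_cof and d_crd on the device ...
CrdToUintCrd(n, d_cof, d_crd, d_uint_crd, stream);  // declared in crd_to_uint_crd_impl.cuh
cudaStreamSynchronize(stream);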

@ -0,0 +1,26 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_CRD_TO_UINT_CRD_IMPL_H_
#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_CRD_TO_UINT_CRD_IMPL_H_
#include <curand_kernel.h>
#include "runtime/device/gpu/cuda_common.h"
void CrdToUintCrd(const int atom_numbers, const float *crd_to_uint_crd_cof_f, const float *crd_f,
unsigned int *uint_crd_f, cudaStream_t stream);
#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_CRD_TO_UINT_CRD_IMPL_H_

@ -14,7 +14,7 @@
* limitations under the License.
*/
#include "backend/kernel_compiler/gpu/cuda_impl/sponge/neighbor_list/neighbor_list_impl.cuh"
#include <vector>
__global__ void Copy_List(const int element_numbers, const int *origin_list, int *list) {
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < element_numbers) {
@ -387,7 +387,7 @@ __global__ void Mul_half(float *src, float *dst) {
}
}
void Neighbor_List_Update(int grid_numbers, int atom_numbers, int *d_refresh_count, int refresh_interval,
int not_first_time, float skin, int Nxy, float cutoff_square, float cutoff_with_skin_square,
int *grid_N, float *box_length, int *atom_numbers_in_grid_bucket, float *grid_length_inverse,
int *atom_in_grid_serial, GRID_BUCKET *bucket, float *crd, float *old_crd,
@ -397,15 +397,22 @@ void Neighbor_List_Update(int grid_numbers, int atom_numbers, int refresh_count,
int *is_need_refresh_neighbor_list, cudaStream_t stream) {
if (not_first_time) {
if (refresh_interval > 0) {
std::vector<int> refresh_count_list(1);
cudaMemcpyAsync(refresh_count_list.data(), d_refresh_count, sizeof(int), cudaMemcpyDeviceToHost, stream);
cudaStreamSynchronize(stream);
int refresh_count = refresh_count_list[0];
if (refresh_count % refresh_interval == 0) {
Mul_half<<<1, 3, 0, stream>>>(crd_to_uint_crd_cof, half_crd_to_uint_crd_cof);
Refresh_Neighbor_List_No_Check(grid_numbers, atom_numbers, skin, Nxy, cutoff_square, grid_N, box_length,
atom_numbers_in_grid_bucket, grid_length_inverse, atom_in_grid_serial, bucket,
reinterpret_cast<VECTOR *>(crd), reinterpret_cast<VECTOR *>(old_crd),
half_crd_to_uint_crd_cof, reinterpret_cast<UNSIGNED_INT_VECTOR *>(uint_crd),
uint_dr_to_dr_cof, gpointer, d_nl, excluded_list_start, excluded_list,
excluded_numbers, stream);
}
refresh_count += 1;
cudaMemcpyAsync(d_refresh_count, &refresh_count, sizeof(int), cudaMemcpyHostToDevice, stream);
} else {
Is_need_refresh_neighbor_list_cuda<<<ceilf(static_cast<float>(atom_numbers) / 128), 128, 0, stream>>>(
atom_numbers, reinterpret_cast<VECTOR *>(crd), reinterpret_cast<VECTOR *>(old_crd), half_skin_square,
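
The hunk above keeps the refresh counter in device memory (d_refresh_count) and round-trips it through the host once per update. A stripped-down sketch of that pattern, with hypothetical names (d_counter, interval), assuming a valid CUDA stream and a device-resident int:

// Read the device counter, test it on the host, then write the incremented value back.
int h_counter = 0;
cudaMemcpyAsync(&h_counter, d_counter, sizeof(int), cudaMemcpyDeviceToHost, stream);
cudaStreamSynchronize(stream);  // the value must land on the host before the modulo test
if (h_counter % interval == 0) {
  // rebuild the neighbor list here
}
h_counter += 1;
cudaMemcpyAsync(d_counter, &h_counter, sizeof(int), cudaMemcpyHostToDevice, stream);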

@ -48,7 +48,7 @@ void Construct_Neighbor_List(int grid_numbers, int max_neighbor_numbers, int *nl
void CopyNeighborListAtomNumber(int atom_numbers, NEIGHBOR_LIST *nl, int *nl_atom_numbers, cudaStream_t stream);
void Neighbor_List_Update(int grid_numbers, int atom_numbers, int *d_refresh_count, int refresh_interval,
int not_first_time, float skin, int Nxy, float cutoff_square, float cutoff_with_skin_square,
int *grid_N, float *box_length, int *atom_numbers_in_grid_bucket, float *grid_length_inverse,
int *atom_in_grid_serial, GRID_BUCKET *bucket, float *crd, float *old_crd,

@ -0,0 +1,67 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/gpu/cuda_impl/sponge/nvtit/md_iteration_leap_frog_liujian_gpu_impl.cuh"
#include "backend/kernel_compiler/gpu/cuda_impl/util.cuh"
#include "backend/kernel_compiler/gpu/cuda_impl/sponge/common_sponge.cuh"
__global__ void MD_Iteration_Leap_Frog_With_LiuJian_kernel(const int atom_numbers, const float half_dt, const float dt,
const float exp_gamma, float *inverse_mass,
float *sqrt_mass_inverse, VECTOR *vel, VECTOR *crd,
VECTOR *frc, VECTOR *acc, VECTOR *random_frc,
VECTOR *output) {
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < atom_numbers) {
acc[i].x = inverse_mass[i] * frc[i].x;
acc[i].y = inverse_mass[i] * frc[i].y;
acc[i].z = inverse_mass[i] * frc[i].z;
vel[i].x = vel[i].x + dt * acc[i].x;
vel[i].y = vel[i].y + dt * acc[i].y;
vel[i].z = vel[i].z + dt * acc[i].z;
output[i].x = crd[i].x + half_dt * vel[i].x;
output[i].y = crd[i].y + half_dt * vel[i].y;
output[i].z = crd[i].z + half_dt * vel[i].z;
vel[i].x = exp_gamma * vel[i].x + sqrt_mass_inverse[i] * random_frc[i].x;
vel[i].y = exp_gamma * vel[i].y + sqrt_mass_inverse[i] * random_frc[i].y;
vel[i].z = exp_gamma * vel[i].z + sqrt_mass_inverse[i] * random_frc[i].z;
output[i].x = output[i].x + half_dt * vel[i].x;
output[i].y = output[i].y + half_dt * vel[i].y;
output[i].z = output[i].z + half_dt * vel[i].z;
}
}
void MD_Iteration_Leap_Frog_With_LiuJian(const int atom_numbers, const float half_dt, const float dt,
const float exp_gamma, int float4_numbers, float *inverse_mass,
float *sqrt_mass_inverse, float *vel, float *crd, float *frc, float *acc,
curandStatePhilox4_32_10_t *rand_state, float *rand_frc, float *output,
cudaStream_t stream) {
Rand_Normal<<<ceilf(static_cast<float>(float4_numbers) / 32.), 32, 0, stream>>>(float4_numbers, rand_state,
reinterpret_cast<float4 *>(rand_frc));
VECTOR *d_vel = reinterpret_cast<VECTOR *>(vel);
VECTOR *d_crd = reinterpret_cast<VECTOR *>(crd);
VECTOR *d_frc = reinterpret_cast<VECTOR *>(frc);
VECTOR *d_acc = reinterpret_cast<VECTOR *>(acc);
VECTOR *d_rand_frc = reinterpret_cast<VECTOR *>(rand_frc);
VECTOR *d_out = reinterpret_cast<VECTOR *>(output);
MD_Iteration_Leap_Frog_With_LiuJian_kernel<<<ceilf(static_cast<float>(atom_numbers) / 32), 32, 0, stream>>>(
atom_numbers, half_dt, dt, exp_gamma, inverse_mass, sqrt_mass_inverse, d_vel, d_crd, d_frc, d_acc, d_rand_frc,
d_out);
}
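
As a reading aid, the per-component update performed by MD_Iteration_Leap_Frog_With_LiuJian_kernel can be written as the following scalar reference (a sketch only, not part of the patch; it mirrors the kernel body above):

// One Liu-Jian Langevin leap-frog step for a single coordinate component.
float liujian_step(float crd, float vel_in, float frc, float inverse_mass, float sqrt_mass_inverse,
                   float random_frc, float half_dt, float dt, float exp_gamma, float *vel_out) {
  float acc = inverse_mass * frc;                          // a = F / m
  float vel = vel_in + dt * acc;                           // full-step velocity update
  float out = crd + half_dt * vel;                         // first half-step position update
  vel = exp_gamma * vel + sqrt_mass_inverse * random_frc;  // friction plus thermal kick
  out = out + half_dt * vel;                               // second half-step position update
  *vel_out = vel;
  return out;
}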

@ -0,0 +1,28 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_MD_ITERATION_LEAP_FROG_LIUJIAN_GPU_IMPL_H_
#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_MD_ITERATION_LEAP_FROG_LIUJIAN_GPU_IMPL_H_
#include <curand_kernel.h>
#include "runtime/device/gpu/cuda_common.h"
void MD_Iteration_Leap_Frog_With_LiuJian(const int atom_numbers, const float half_dt, const float dt,
const float exp_gamma, int float4_numbers, float *inverse_mass,
float *sqrt_mass_inverse, float *vel, float *crd, float *frc, float *acc,
curandStatePhilox4_32_10_t *rand_state, float *rand_frc, float *output,
cudaStream_t stream);
#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_MD_ITERATION_LEAP_FROG_LIUJIAN_GPU_IMPL_H_

@ -0,0 +1,28 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/gpu/cuda_impl/sponge/nvtit/md_iteration_setup_random_state_gpu_impl.cuh"
#include "backend/kernel_compiler/gpu/cuda_impl/util.cuh"
#include "backend/kernel_compiler/gpu/cuda_impl/sponge/common_sponge.cuh"
void MD_Iteration_Setup_Random_State(int float4_numbers, curandStatePhilox4_32_10_t *rand_state, int seed,
cudaStream_t stream) {
Setup_Rand_Normal_Kernel<<<ceilf(static_cast<float>(float4_numbers) / 32.), 32, 0, stream>>>(float4_numbers,
rand_state, seed);
}
void MD_Iteration_Setup_Random_State(int float4_numbers, curandStatePhilox4_32_10_t *rand_state, int seed,
cudaStream_t stream);

@ -0,0 +1,23 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_MD_ITERATION_SETUP_RANDOM_STATE_GPU_IMPL_H_
#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_MD_ITERATION_SETUP_RANDOM_STATE_GPU_IMPL_H_
#include <curand_kernel.h>
#include "runtime/device/gpu/cuda_common.h"
void MD_Iteration_Setup_Random_State(int float4_numbers, curandStatePhilox4_32_10_t *rand_state, int seed,
cudaStream_t stream);
#endif

@ -93,12 +93,13 @@ __global__ void PME_Excluded_Energy_Correction(const int atom_numbers, const UNS
}
}
void PMEEnergy(int fftx, int ffty, int fftz, int atom_numbers, float beta, float *PME_BC, int *pme_uxyz,
float *pme_frxyz, float *PME_Q, float *pme_fq, int *PME_atom_near, int *pme_kxyz, const int *uint_crd_f,
const float *charge, int *nl_atom_numbers, int *nl_atom_serial, int *nl, const float *scaler_f,
const int *excluded_list_start, const int *excluded_list, const int *excluded_atom_numbers,
float *d_reciprocal_ene, float *d_self_ene, float *d_direct_ene, float *d_correction_ene,
dim3 thread_PME, int PME_Nin, int PME_Nfft, int PME_Nall, const cufftHandle &PME_plan_r2c,
const cufftHandle &PME_plan_c2r, cudaStream_t stream) {
UNSIGNED_INT_VECTOR *uint_crd =
const_cast<UNSIGNED_INT_VECTOR *>(reinterpret_cast<const UNSIGNED_INT_VECTOR *>(uint_crd_f));
VECTOR *scaler = const_cast<VECTOR *>(reinterpret_cast<const VECTOR *>(scaler_f));
@ -106,97 +107,11 @@ void PMEEnergy(int fftx, int ffty, int fftz, int atom_numbers, float beta, float
NEIGHBOR_LIST *nl_a = reinterpret_cast<NEIGHBOR_LIST *>(nl);
construct_neighbor_list_kernel<<<ceilf(static_cast<float>(atom_numbers) / 128), 128, 0, stream>>>(
atom_numbers, max_neighbor_numbers, nl_atom_numbers, nl_atom_serial, nl_a);
std::vector<float> h_box_length(3);
cudaMemcpyAsync(h_box_length.data(), box_length_f, sizeof(float) * h_box_length.size(), cudaMemcpyDeviceToHost,
stream);
cudaStreamSynchronize(stream);
VECTOR *box_length = reinterpret_cast<VECTOR *>(h_box_length.data());
UNSIGNED_INT_VECTOR *PME_uxyz = reinterpret_cast<UNSIGNED_INT_VECTOR *>(pme_uxyz);
UNSIGNED_INT_VECTOR *PME_kxyz = reinterpret_cast<UNSIGNED_INT_VECTOR *>(pme_kxyz);
VECTOR *PME_frxyz = reinterpret_cast<VECTOR *>(pme_frxyz);
cufftComplex *PME_FQ = reinterpret_cast<cufftComplex *>(pme_fq);
cufftHandle PME_plan_r2c;
cufftHandle PME_plan_c2r;
cufftPlan3d(&PME_plan_r2c, fftx, ffty, fftz, CUFFT_R2C);
cufftPlan3d(&PME_plan_c2r, fftx, ffty, fftz, CUFFT_C2R);
cufftSetStream(PME_plan_r2c, stream);
cufftSetStream(PME_plan_c2r, stream);
thread_PME.x = 8;
thread_PME.y = 8;
int PME_Nin = ffty * fftz;
int PME_Nfft = fftx * ffty * (fftz / 2 + 1);
int PME_Nall = fftx * ffty * fftz;
float volume = box_length[0].x * box_length[0].y * box_length[0].z;
UNSIGNED_INT_VECTOR *PME_kxyz_cpu;
Malloc_Safely(reinterpret_cast<void **>(&PME_kxyz_cpu), sizeof(UNSIGNED_INT_VECTOR) * 64);
int kx, ky, kz, kxrp, kyrp, kzrp, index;
for (kx = 0; kx < 4; kx++) {
for (ky = 0; ky < 4; ky++) {
for (kz = 0; kz < 4; kz++) {
index = kx * 16 + ky * 4 + kz;
PME_kxyz_cpu[index].uint_x = kx;
PME_kxyz_cpu[index].uint_y = ky;
PME_kxyz_cpu[index].uint_z = kz;
}
}
}
cudaMemcpyAsync(PME_kxyz, PME_kxyz_cpu, sizeof(UNSIGNED_INT_VECTOR) * 64, cudaMemcpyHostToDevice, stream);
cudaStreamSynchronize(stream);
free(PME_kxyz_cpu);
// initial start
float *B1, *B2, *B3, *PME_BC0;
B1 = reinterpret_cast<float *>(malloc(sizeof(float) * fftx));
B2 = reinterpret_cast<float *>(malloc(sizeof(float) * ffty));
B3 = reinterpret_cast<float *>(malloc(sizeof(float) * fftz));
PME_BC0 = reinterpret_cast<float *>(malloc(sizeof(float) * PME_Nfft));
for (kx = 0; kx < fftx; kx++) {
B1[kx] = getb(kx, fftx, 4);
}
for (ky = 0; ky < ffty; ky++) {
B2[ky] = getb(ky, ffty, 4);
}
for (kz = 0; kz < fftz; kz++) {
B3[kz] = getb(kz, fftz, 4);
}
float mprefactor = PI * PI / -beta / beta;
float msq;
for (kx = 0; kx < fftx; kx++) {
kxrp = kx;
if (kx > fftx / 2) kxrp = fftx - kx;
for (ky = 0; ky < ffty; ky++) {
kyrp = ky;
if (ky > ffty / 2) kyrp = ffty - ky;
for (kz = 0; kz <= fftz / 2; kz++) {
kzrp = kz;
msq = kxrp * kxrp / box_length[0].x / box_length[0].x + kyrp * kyrp / box_length[0].y / box_length[0].y +
kzrp * kzrp / box_length[0].z / box_length[0].z;
index = kx * ffty * (fftz / 2 + 1) + ky * (fftz / 2 + 1) + kz;
if ((kx + ky + kz) == 0) {
PME_BC0[index] = 0;
} else {
PME_BC0[index] = 1.0 / PI / msq * exp(mprefactor * msq) / volume;
}
PME_BC0[index] *= B1[kx] * B2[ky] * B3[kz];
}
}
}
cudaMemcpyAsync(PME_BC, PME_BC0, sizeof(float) * PME_Nfft, cudaMemcpyHostToDevice, stream);
cudaStreamSynchronize(stream);
free(B1);
free(B2);
free(B3);
free(PME_BC0);
Reset_List<<<3 * atom_numbers / 32 + 1, 32, 0, stream>>>(3 * atom_numbers, reinterpret_cast<int *>(PME_uxyz),
1 << 30);
@ -226,9 +141,3 @@ void PMEEnergy(int fftx, int ffty, int fftz, int atom_numbers, float beta, float
d_correction_ene);
return;
}
void PMEEnergy(int fftx, int ffty, int fftz, int atom_numbers, float beta, float *box_length_f, float *PME_BC,
int *pme_uxyz, float *pme_frxyz, float *PME_Q, float *pme_fq, int *PME_atom_near, int *pme_kxyz,
const int *uint_crd_f, const float *charge, int *nl_atom_numbers, int *nl_atom_serial, int *nl,
const float *scaler_f, const int *excluded_list_start, const int *excluded_list,
const int *excluded_atom_numbers, float *d_reciprocal_ene, float *d_self_ene, float *d_direct_ene,
float *d_correction_ene, cudaStream_t stream);

@ -16,15 +16,15 @@
#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_PME_PME_ENERGY_IMPL_H_
#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_PME_PME_ENERGY_IMPL_H_
#include <cufft.h>
#include <vector>
#include "runtime/device/gpu/cuda_common.h"
void PMEEnergy(int fftx, int ffty, int fftz, int atom_numbers, float beta, float *PME_BC, int *pme_uxyz,
float *pme_frxyz, float *PME_Q, float *pme_fq, int *PME_atom_near, int *pme_kxyz, const int *uint_crd_f,
const float *charge, int *nl_atom_numbers, int *nl_atom_serial, int *nl, const float *scaler_f,
const int *excluded_list_start, const int *excluded_list, const int *excluded_atom_numbers,
float *d_reciprocal_ene, float *d_self_ene, float *d_direct_ene, float *d_correction_ene,
dim3 thread_PME, int PME_Nin, int PME_Nfft, int PME_Nall, const cufftHandle &PME_plan_r2c,
const cufftHandle &PME_plan_c2r, cudaStream_t stream);
#endif

@ -28,7 +28,7 @@ __global__ void PME_BCFQ(cufftComplex *PME_FQ, float *PME_BC, int PME_Nfft) {
__global__ void PME_Final(int *PME_atom_near, const float *charge, const float *PME_Q, VECTOR *force,
const VECTOR *PME_frxyz, const UNSIGNED_INT_VECTOR *PME_kxyz,
const _VECTOR PME_inverse_box_vector, const int atom_numbers) {
int atom = blockDim.x * blockIdx.x + threadIdx.x;
if (atom < atom_numbers) {
int k, kx;
@ -73,8 +73,9 @@ __global__ void PME_Final(int *PME_atom_near, const float *charge, const float *
void PMEReciprocalForce(int fftx, int ffty, int fftz, int atom_numbers, float beta, float *PME_BC, int *pme_uxyz,
float *pme_frxyz, float *PME_Q, float *pme_fq, int *PME_atom_near, int *pme_kxyz,
const int *uint_crd_f, const float *charge, float *force, int PME_Nin, int PME_Nall,
int PME_Nfft, const cufftHandle &PME_plan_r2c, const cufftHandle &PME_plan_c2r,
const _VECTOR &PME_inverse_box_vector, cudaStream_t stream) {
Reset_List<<<ceilf(static_cast<float>(3. * atom_numbers) / 128), 128, 0, stream>>>(3 * atom_numbers, force, 0.);
UNSIGNED_INT_VECTOR *uint_crd =
const_cast<UNSIGNED_INT_VECTOR *>(reinterpret_cast<const UNSIGNED_INT_VECTOR *>(uint_crd_f));
@ -86,98 +87,8 @@ void PMEReciprocalForce(int fftx, int ffty, int fftz, int atom_numbers, float be
VECTOR *PME_frxyz = reinterpret_cast<VECTOR *>(pme_frxyz);
VECTOR *frc = reinterpret_cast<VECTOR *>(force);
std::vector<float> h_box_length(3);
cudaMemcpyAsync(h_box_length.data(), box_length_f, sizeof(float) * h_box_length.size(), cudaMemcpyDeviceToHost,
stream);
cudaStreamSynchronize(stream);
VECTOR *box_length = const_cast<VECTOR *>(reinterpret_cast<const VECTOR *>(h_box_length.data()));
cufftComplex *PME_FQ = reinterpret_cast<cufftComplex *>(pme_fq);
VECTOR PME_inverse_box_vector;
PME_inverse_box_vector.x = static_cast<float>(fftx) / box_length[0].x;
PME_inverse_box_vector.y = static_cast<float>(ffty) / box_length[0].y;
PME_inverse_box_vector.z = static_cast<float>(fftz) / box_length[0].z;
cufftHandle PME_plan_r2c;
cufftHandle PME_plan_c2r;
cufftPlan3d(&PME_plan_r2c, fftx, ffty, fftz, CUFFT_R2C);
cufftPlan3d(&PME_plan_c2r, fftx, ffty, fftz, CUFFT_C2R);
cufftSetStream(PME_plan_r2c, stream);
cufftSetStream(PME_plan_c2r, stream);
thread_PME.x = 8;
thread_PME.y = 8;
int PME_Nin = ffty * fftz;
int PME_Nfft = fftx * ffty * (fftz / 2 + 1);
int PME_Nall = fftx * ffty * fftz;
float volume = box_length[0].x * box_length[0].y * box_length[0].z;
UNSIGNED_INT_VECTOR *PME_kxyz_cpu;
Malloc_Safely(reinterpret_cast<void **>(&PME_kxyz_cpu), sizeof(UNSIGNED_INT_VECTOR) * 64);
int kx, ky, kz, kxrp, kyrp, kzrp, index;
for (kx = 0; kx < 4; kx++) {
for (ky = 0; ky < 4; ky++) {
for (kz = 0; kz < 4; kz++) {
index = kx * 16 + ky * 4 + kz;
PME_kxyz_cpu[index].uint_x = kx;
PME_kxyz_cpu[index].uint_y = ky;
PME_kxyz_cpu[index].uint_z = kz;
}
}
}
cudaMemcpyAsync(PME_kxyz, PME_kxyz_cpu, sizeof(UNSIGNED_INT_VECTOR) * 64, cudaMemcpyHostToDevice, stream);
cudaStreamSynchronize(stream);
free(PME_kxyz_cpu);
// initial start
float *B1, *B2, *B3, *PME_BC0;
B1 = reinterpret_cast<float *>(malloc(sizeof(float) * fftx));
B2 = reinterpret_cast<float *>(malloc(sizeof(float) * ffty));
B3 = reinterpret_cast<float *>(malloc(sizeof(float) * fftz));
PME_BC0 = reinterpret_cast<float *>(malloc(sizeof(float) * PME_Nfft));
for (kx = 0; kx < fftx; kx++) {
B1[kx] = getb(kx, fftx, 4);
}
for (ky = 0; ky < ffty; ky++) {
B2[ky] = getb(ky, ffty, 4);
}
for (kz = 0; kz < fftz; kz++) {
B3[kz] = getb(kz, fftz, 4);
}
float mprefactor = PI * PI / -beta / beta;
float msq;
for (kx = 0; kx < fftx; kx++) {
kxrp = kx;
if (kx > fftx / 2) kxrp = fftx - kx;
for (ky = 0; ky < ffty; ky++) {
kyrp = ky;
if (ky > ffty / 2) kyrp = ffty - ky;
for (kz = 0; kz <= fftz / 2; kz++) {
kzrp = kz;
msq = kxrp * kxrp / box_length[0].x / box_length[0].x + kyrp * kyrp / box_length[0].y / box_length[0].y +
kzrp * kzrp / box_length[0].z / box_length[0].z;
index = kx * ffty * (fftz / 2 + 1) + ky * (fftz / 2 + 1) + kz;
if ((kx + ky + kz) == 0) {
PME_BC0[index] = 0;
} else {
PME_BC0[index] = 1.0 / PI / msq * exp(mprefactor * msq) / volume;
}
PME_BC0[index] *= B1[kx] * B2[ky] * B3[kz];
}
}
}
cudaMemcpyAsync(PME_BC, PME_BC0, sizeof(float) * PME_Nfft, cudaMemcpyHostToDevice, stream);
cudaStreamSynchronize(stream);
free(B1);
free(B2);
free(B3);
free(PME_BC0);
// initial end
Reset_List<<<ceilf(static_cast<float>(3. * atom_numbers) / 128), 128, 0, stream>>>(
3 * atom_numbers, reinterpret_cast<float *>(frc), 0.);
@ -198,8 +109,3 @@ void PMEReciprocalForce(int fftx, int ffty, int fftz, int atom_numbers, float be
PME_kxyz, PME_inverse_box_vector, atom_numbers);
return;
}
void PMEReciprocalForce(int fftx, int ffty, int fftz, int atom_numbers, float beta, float *PME_BC, int *pme_uxyz,
float *pme_frxyz, float *PME_Q, float *pme_fq, int *PME_atom_near, int *pme_kxyz,
const float *box_length_f, const int *uint_crd_f, const float *charge, float *force,
cudaStream_t stream);

@ -16,13 +16,18 @@
#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_PME_PME_RECIPROCAL_FORCE_IMPL_H_
#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_PME_PME_RECIPROCAL_FORCE_IMPL_H_
#include <cufft.h>
#include <vector>
#include "runtime/device/gpu/cuda_common.h"
struct _VECTOR {
float x;
float y;
float z;
};
void PMEReciprocalForce(int fftx, int ffty, int fftz, int atom_numbers, float beta, float *PME_BC, int *pme_uxyz,
float *pme_frxyz, float *PME_Q, float *pme_fq, int *PME_atom_near, int *pme_kxyz,
const int *uint_crd_f, const float *charge, float *force, int PME_Nin, int PME_Nall,
int PME_Nfft, const cufftHandle &PME_plan_r2c, const cufftHandle &PME_plan_c2r,
const _VECTOR &PME_inverse_box_vector, cudaStream_t stream);
#endif

@ -0,0 +1,27 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/gpu/sponge/common/crd_to_uint_crd_kernel.h"
namespace mindspore {
namespace kernel {
MS_REG_GPU_KERNEL_TWO(
CrdToUintCrd,
KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeUInt32),
CrdToUintCrdGpuKernel, float, unsigned int)
} // namespace kernel
} // namespace mindspore

@ -0,0 +1,87 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONG_COMMON_CRD_TO_UINT_CRD_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONG_COMMON_CRD_TO_UINT_CRD_KERNEL_H_
#include <cuda_runtime_api.h>
#include <vector>
#include <string>
#include <map>
#include "backend/kernel_compiler/gpu/gpu_kernel.h"
#include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
#include "runtime/device/gpu/cuda_common.h"
#include "backend/kernel_compiler/gpu/cuda_impl/sponge/common/crd_to_uint_crd_impl.cuh"
namespace mindspore {
namespace kernel {
template <typename T, typename T1>
class CrdToUintCrdGpuKernel : public GpuKernel {
public:
CrdToUintCrdGpuKernel() : ele_crd(1) {}
~CrdToUintCrdGpuKernel() override = default;
bool Init(const CNodePtr &kernel_node) override {
kernel_node_ = kernel_node;
atom_numbers = static_cast<int>(GetAttr<int64_t>(kernel_node, "atom_numbers"));
auto shape_crd_to_uint_crd_cof = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
auto shape_crd = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
for (size_t i = 0; i < shape_crd_to_uint_crd_cof.size(); i++)
ele_crd_to_uint_crd_cof *= shape_crd_to_uint_crd_cof[i];
for (size_t i = 0; i < shape_crd.size(); i++) ele_crd *= shape_crd[i];
InitSizeLists();
return true;
}
const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
auto crd_to_uint_crd_cof = GetDeviceAddress<const T>(inputs, 0);
auto crd = GetDeviceAddress<const T>(inputs, 1);
auto uint_crd = GetDeviceAddress<T1>(outputs, 0);
CrdToUintCrd(atom_numbers, crd_to_uint_crd_cof, crd, uint_crd, reinterpret_cast<cudaStream_t>(stream_ptr));
return true;
}
protected:
void InitSizeLists() override {
input_size_list_.push_back(ele_crd_to_uint_crd_cof * sizeof(T));
input_size_list_.push_back(ele_crd * sizeof(T));
output_size_list_.push_back(3 * atom_numbers * sizeof(T));
}
private:
size_t ele_crd_to_uint_crd_cof = 1;
size_t ele_crd = 1;
std::vector<size_t> input_size_list_;
std::vector<size_t> output_size_list_;
std::vector<size_t> workspace_size_list_;
int atom_numbers;
};
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONG_COMMON_CRD_TO_UINT_CRD_KERNEL_H_

@ -38,6 +38,7 @@ MS_REG_GPU_KERNEL_TWO(NeighborListUpdate,
.AddInputAttr(kNumberTypeInt32)
.AddInputAttr(kNumberTypeInt32)
.AddInputAttr(kNumberTypeInt32)
.AddInputAttr(kNumberTypeInt32)
.AddOutputAttr(kNumberTypeFloat32),
NeighborListUpdateGpuKernel, int, float)

@ -36,7 +36,6 @@ class NeighborListUpdateGpuKernel : public GpuKernel {
bool Init(const CNodePtr &kernel_node) override {
grid_numbers = static_cast<int>(GetAttr<int64_t>(kernel_node, "grid_numbers"));
atom_numbers = static_cast<int>(GetAttr<int64_t>(kernel_node, "atom_numbers"));
refresh_count = static_cast<int>(GetAttr<int64_t>(kernel_node, "refresh_count"));
refresh_interval = static_cast<int>(GetAttr<int64_t>(kernel_node, "refresh_interval"));
not_first_time = static_cast<int>(GetAttr<int64_t>(kernel_node, "not_first_time"));
Nxy = static_cast<int>(GetAttr<int64_t>(kernel_node, "Nxy"));
@ -47,7 +46,8 @@ class NeighborListUpdateGpuKernel : public GpuKernel {
cutoff_with_skin = static_cast<float>(GetAttr<float>(kernel_node, "cutoff_with_skin"));
half_cutoff_with_skin = static_cast<float>(GetAttr<float>(kernel_node, "half_cutoff_with_skin"));
cutoff_with_skin_square = static_cast<float>(GetAttr<float>(kernel_node, "cutoff_with_skin_square"));
h_bucket.resize(grid_numbers);
h_gpointer.resize(grid_numbers);
InitSizeLists();
return true;
}
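// h_bucket and h_gpointer are member vectors sized once in Init above, so Launch() below reuses them
// instead of constructing new host-side vectors on every call (their former per-launch definitions are
// left commented out further down).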
@ -76,17 +76,18 @@ class NeighborListUpdateGpuKernel : public GpuKernel {
auto excluded_list = GetDeviceAddress<int>(inputs, 15);
auto excluded_numbers = GetDeviceAddress<int>(inputs, 16);
auto need_refresh_flag = GetDeviceAddress<int>(inputs, 17);
auto d_refresh_count = GetDeviceAddress<int>(inputs, 18);
GRID_BUCKET *d_bucket = reinterpret_cast<GRID_BUCKET *>(GetDeviceAddress<int>(workspaces, 0));
GRID_POINTER *d_gpointer = reinterpret_cast<GRID_POINTER *>(GetDeviceAddress<int>(workspaces, 1));
NEIGHBOR_LIST *nl = GetDeviceAddress<NEIGHBOR_LIST>(workspaces, 2);
float *half_crd_to_uint_crd_cof = GetDeviceAddress<float>(workspaces, 3);
// std::vector<GRID_BUCKET> h_bucket(grid_numbers);
for (size_t i = 0; i < h_bucket.size(); i += 1) {
h_bucket[i].atom_serial = bucket + i * max_atom_in_grid_numbers;
}
// std::vector<GRID_POINTER> h_gpointer(grid_numbers);
for (size_t i = 0; i < h_gpointer.size(); i += 1) {
h_gpointer[i].grid_serial = gpointer + i * 125;
}
@ -98,7 +99,7 @@ class NeighborListUpdateGpuKernel : public GpuKernel {
Construct_Neighbor_List(atom_numbers, max_neighbor_numbers, nl_atom_numbers, nl_atom_serial, nl,
reinterpret_cast<cudaStream_t>(stream_ptr));
Neighbor_List_Update(grid_numbers, atom_numbers, d_refresh_count, refresh_interval, not_first_time, skin, Nxy,
cutoff_square, cutoff_with_skin_square, grid_N, box_length, atom_numbers_in_grid_bucket,
grid_length_inverse, atom_in_grid_serial, d_bucket, crd, old_crd, crd_to_uint_crd_cof,
half_crd_to_uint_crd_cof, uint_crd, uint_dr_to_dr_cof, d_gpointer, nl, excluded_list_start,
@ -132,6 +133,7 @@ class NeighborListUpdateGpuKernel : public GpuKernel {
input_size_list_.push_back(sizeof(int) * excluded_atom_numbers);
input_size_list_.push_back(sizeof(int) * atom_numbers);
input_size_list_.push_back(sizeof(int));
input_size_list_.push_back(sizeof(int));
workspace_size_list_.push_back(sizeof(GRID_BUCKET) * grid_numbers);
@ -148,7 +150,6 @@ class NeighborListUpdateGpuKernel : public GpuKernel {
int not_first_time;
int atom_numbers;
int grid_numbers;
int refresh_count;
int refresh_interval;
int Nxy;
int max_atom_in_grid_numbers;
@ -163,6 +164,8 @@ class NeighborListUpdateGpuKernel : public GpuKernel {
std::vector<size_t> input_size_list_;
std::vector<size_t> output_size_list_;
std::vector<size_t> workspace_size_list_;
std::vector<GRID_BUCKET> h_bucket;
std::vector<GRID_POINTER> h_gpointer;
};
} // namespace kernel
} // namespace mindspore

@ -45,14 +45,6 @@ class MDIterationLeapFrogGpuKernel : public GpuKernel {
is_max_velocity = static_cast<int>(GetAttr<int64_t>(kernel_node, "is_max_velocity"));
max_velocity = static_cast<float>(GetAttr<float>(kernel_node, "max_velocity"));
// printf("float4_numbers: %d", float4_numbers);
// printf("atom_numbers: %d", atom_numbers);
// printf("half_dt: %f", half_dt);
// printf("dt: %f", dt);
// printf("exp_gamma: %f", exp_gamma);
// printf("is_max_velocity: %d", is_max_velocity);
// printf("max_velocity: %f", max_velocity);
auto shape_mass_inverse = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
auto shape_qrt_mass = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);

@ -0,0 +1,35 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/gpu/sponge/nvtit/md_iteration_leap_frog_liujian_gpu_kernel.h"
namespace mindspore {
namespace kernel {
MS_REG_GPU_KERNEL_TWO(MDIterationLeapFrogLiujian,
KernelAttr()
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat32),
MDIterationLeapFrogLiujianCudaGpuKernel, float, int)
} // namespace kernel
} // namespace mindspore

@ -0,0 +1,100 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_MD_ITERATION_LEAP_FROG_LIUJIAN_GPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_MD_ITERATION_LEAP_FROG_LIUJIAN_GPU_KERNEL_H_
#include "backend/kernel_compiler/gpu/cuda_impl/sponge/nvtit/md_iteration_leap_frog_liujian_gpu_impl.cuh"
#include <cuda_runtime_api.h>
#include <map>
#include <string>
#include <vector>
#include "backend/kernel_compiler/gpu/gpu_kernel.h"
#include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
#include "runtime/device/gpu/cuda_common.h"
namespace mindspore {
namespace kernel {
template <typename T, typename T1>
class MDIterationLeapFrogLiujianCudaGpuKernel : public GpuKernel {
public:
MDIterationLeapFrogLiujianCudaGpuKernel() {}
~MDIterationLeapFrogLiujianCudaGpuKernel() override = default;
bool Init(const CNodePtr &kernel_node) override {
// get bond_numbers
kernel_node_ = kernel_node;
atom_numbers = static_cast<int>(GetAttr<int64_t>(kernel_node, "atom_numbers"));
half_dt = static_cast<float>(GetAttr<float>(kernel_node, "half_dt"));
dt = static_cast<float>(GetAttr<float>(kernel_node, "dt"));
exp_gamma = static_cast<float>(GetAttr<float>(kernel_node, "exp_gamma"));
float4_numbers = ceil(3. * static_cast<double>(atom_numbers) / 4.);
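// Rand_Normal generates normals four at a time (one float4 per Philox state), so the 3 * atom_numbers
// random force components are covered by ceil(3 * atom_numbers / 4) states.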
InitSizeLists();
return true;
}
const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
auto inverse_mass = GetDeviceAddress<float>(inputs, 0);
auto sqrt_mass_inverse = GetDeviceAddress<float>(inputs, 1);
auto vel = GetDeviceAddress<float>(inputs, 2);
auto crd = GetDeviceAddress<float>(inputs, 3);
auto frc = GetDeviceAddress<float>(inputs, 4);
auto acc = GetDeviceAddress<float>(inputs, 5);
auto rand_state = GetDeviceAddress<float>(inputs, 6);
auto rand_frc = GetDeviceAddress<float>(inputs, 7);
auto output = GetDeviceAddress<float>(outputs, 0);
MD_Iteration_Leap_Frog_With_LiuJian(atom_numbers, half_dt, dt, exp_gamma, float4_numbers, inverse_mass,
sqrt_mass_inverse, vel, crd, frc, acc,
reinterpret_cast<curandStatePhilox4_32_10_t *>(rand_state), rand_frc, output,
reinterpret_cast<cudaStream_t>(stream_ptr));
return true;
}
protected:
void InitSizeLists() override {
input_size_list_.push_back(atom_numbers * sizeof(float));
input_size_list_.push_back(atom_numbers * sizeof(float));
input_size_list_.push_back(atom_numbers * 3 * sizeof(float));
input_size_list_.push_back(atom_numbers * 3 * sizeof(float));
input_size_list_.push_back(atom_numbers * 3 * sizeof(float));
input_size_list_.push_back(atom_numbers * 3 * sizeof(float));
input_size_list_.push_back(float4_numbers * sizeof(curandStatePhilox4_32_10_t));
input_size_list_.push_back(atom_numbers * 3 * sizeof(float));
output_size_list_.push_back(atom_numbers * 3 * sizeof(T));
}
private:
std::vector<size_t> input_size_list_;
std::vector<size_t> output_size_list_;
std::vector<size_t> workspace_size_list_;
int atom_numbers;
float half_dt;
float dt;
float exp_gamma;
int float4_numbers;
};
} // namespace kernel
} // namespace mindspore
#endif

@ -0,0 +1,25 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/gpu/sponge/nvtit/md_iteration_setup_random_state.h"
namespace mindspore {
namespace kernel {
MS_REG_GPU_KERNEL_TWO(MDIterationSetupRandState, KernelAttr().AddOutputAttr(kNumberTypeFloat32),
MDIterationSetupRandStateGpuKernel, float, int)
} // namespace kernel
} // namespace mindspore

@ -0,0 +1,74 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_MD_ITERATION_SETUP_RANDOM_STATE_GPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_MD_ITERATION_SETUP_RANDOM_STATE_GPU_KERNEL_H_
#include "backend/kernel_compiler/gpu/cuda_impl/sponge/nvtit/md_iteration_setup_random_state_gpu_impl.cuh"
#include <cuda_runtime_api.h>
#include <map>
#include <string>
#include <vector>
#include "backend/kernel_compiler/gpu/gpu_kernel.h"
#include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
#include "runtime/device/gpu/cuda_common.h"
namespace mindspore {
namespace kernel {
template <typename T, typename T1>
class MDIterationSetupRandStateGpuKernel : public GpuKernel {
public:
MDIterationSetupRandStateGpuKernel() {}
~MDIterationSetupRandStateGpuKernel() override = default;
bool Init(const CNodePtr &kernel_node) override {
// get bond_numbers
kernel_node_ = kernel_node;
atom_numbers = static_cast<int>(GetAttr<int64_t>(kernel_node, "atom_numbers"));
seed = static_cast<int>(GetAttr<int64_t>(kernel_node, "seed"));
float4_numbers = ceil(3. * static_cast<double>(atom_numbers) / 4.);
InitSizeLists();
return true;
}
const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
auto output = GetDeviceAddress<float>(outputs, 0);
curandStatePhilox4_32_10_t *rand_state = reinterpret_cast<curandStatePhilox4_32_10_t *>(output);
MD_Iteration_Setup_Random_State(float4_numbers, rand_state, seed, reinterpret_cast<cudaStream_t>(stream_ptr));
return true;
}
protected:
void InitSizeLists() override { output_size_list_.push_back(sizeof(curandStatePhilox4_32_10_t) * float4_numbers); }
private:
std::vector<size_t> input_size_list_;
std::vector<size_t> output_size_list_;
std::vector<size_t> workspace_size_list_;
int atom_numbers;
int seed;
int float4_numbers;
};
} // namespace kernel
} // namespace mindspore
#endif

@ -19,7 +19,6 @@ namespace mindspore {
namespace kernel {
MS_REG_GPU_KERNEL_TWO(PMEEnergy,
KernelAttr()
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeUInt32) .AddInputAttr(kNumberTypeUInt32)
.AddInputAttr(kNumberTypeFloat32) .AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeInt32) .AddInputAttr(kNumberTypeInt32)

@ -18,8 +18,6 @@
#include <cuda_runtime_api.h>
#include <cufft.h>
#include <vector>
#include <string>
#include <map>
#include "backend/kernel_compiler/gpu/gpu_kernel.h" #include "backend/kernel_compiler/gpu/gpu_kernel.h"
#include "backend/kernel_compiler/gpu/gpu_kernel_factory.h" #include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
#include "runtime/device/gpu/cuda_common.h" #include "runtime/device/gpu/cuda_common.h"
@ -40,8 +38,76 @@ class PMEEnergyGpuKernel : public GpuKernel {
fftx = static_cast<int>(GetAttr<int64_t>(kernel_node, "fftx"));
ffty = static_cast<int>(GetAttr<int64_t>(kernel_node, "ffty"));
fftz = static_cast<int>(GetAttr<int64_t>(kernel_node, "fftz"));
PME_Nall = fftx * ffty * fftz;
float box_length_0 = static_cast<float>(GetAttr<float_t>(kernel_node, "box_length_0"));
float box_length_1 = static_cast<float>(GetAttr<float_t>(kernel_node, "box_length_1"));
float box_length_2 = static_cast<float>(GetAttr<float_t>(kernel_node, "box_length_2"));
std::vector<float> h_box_length(3);
h_box_length[0] = box_length_0;
h_box_length[1] = box_length_1;
h_box_length[2] = box_length_2;
VECTOR *box_length = reinterpret_cast<VECTOR *>(h_box_length.data());
cufftPlan3d(&PME_plan_r2c, fftx, ffty, fftz, CUFFT_R2C);
cufftPlan3d(&PME_plan_c2r, fftx, ffty, fftz, CUFFT_C2R);
_thread_PME.x = 8;
_thread_PME.y = 8;
PME_Nin = ffty * fftz;
PME_Nfft = fftx * ffty * (fftz / 2 + 1);
PME_Nall = fftx * ffty * fftz;
PME_kxyz_cpu.resize(64);
volume = box_length[0].x * box_length[0].y * box_length[0].z;
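// The loops below precompute the smooth-PME host tables once at Init time:
//   PME_kxyz_cpu enumerates the 4 x 4 x 4 B-spline stencil offsets,
//   B1 / B2 / B3 hold the squared B-spline moduli |b(k)|^2 per axis (computed by getb further down),
//   PME_BC0[m] = exp(-pi^2 |m|^2 / beta^2) / (pi |m|^2 volume) * B1 * B2 * B3, with the m = 0 term zeroed.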
int kx, ky, kz, kxrp, kyrp, kzrp, index;
for (kx = 0; kx < 4; kx++) {
for (ky = 0; ky < 4; ky++) {
for (kz = 0; kz < 4; kz++) {
index = kx * 16 + ky * 4 + kz;
PME_kxyz_cpu[index].uint_x = kx;
PME_kxyz_cpu[index].uint_y = ky;
PME_kxyz_cpu[index].uint_z = kz;
}
}
}
B1.resize(fftx);
B2.resize(ffty);
B3.resize(fftz);
PME_BC0.resize(PME_Nfft);
for (kx = 0; kx < fftx; kx++) {
B1[kx] = getb(kx, fftx, 4);
}
for (ky = 0; ky < ffty; ky++) {
B2[ky] = getb(ky, ffty, 4);
}
for (kz = 0; kz < fftz; kz++) {
B3[kz] = getb(kz, fftz, 4);
}
float mprefactor = PI * PI / -beta / beta;
float msq;
for (kx = 0; kx < fftx; kx++) {
kxrp = kx;
if (kx > fftx / 2) kxrp = fftx - kx;
for (ky = 0; ky < ffty; ky++) {
kyrp = ky;
if (ky > ffty / 2) kyrp = ffty - ky;
for (kz = 0; kz <= fftz / 2; kz++) {
kzrp = kz;
msq = kxrp * kxrp / box_length[0].x / box_length[0].x + kyrp * kyrp / box_length[0].y / box_length[0].y +
kzrp * kzrp / box_length[0].z / box_length[0].z;
index = kx * ffty * (fftz / 2 + 1) + ky * (fftz / 2 + 1) + kz;
if ((kx + ky + kz) == 0) {
PME_BC0[index] = 0;
} else {
PME_BC0[index] = 1.0 / PI / msq * exp(mprefactor * msq) / volume;
}
PME_BC0[index] *= B1[kx] * B2[ky] * B3[kz];
}
}
}
InitSizeLists();
return true;
@ -53,15 +119,14 @@ class PMEEnergyGpuKernel : public GpuKernel {
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
auto uint_crd = GetDeviceAddress<T1>(inputs, 0);
auto charge = GetDeviceAddress<T>(inputs, 1);
auto nl_numbers = GetDeviceAddress<T1>(inputs, 2);
auto nl_serial = GetDeviceAddress<T1>(inputs, 3);
auto scaler = GetDeviceAddress<T>(inputs, 4);
auto excluded_list_start = GetDeviceAddress<int>(inputs, 5);
auto excluded_list = GetDeviceAddress<int>(inputs, 6);
auto excluded_atom_numbers = GetDeviceAddress<int>(inputs, 7);
auto excluded_atom_numbers = GetDeviceAddress<int>(inputs, 8);
auto pme_uxyz = GetDeviceAddress<int>(workspace, 0);     // workspace
auto pme_frxyz = GetDeviceAddress<float>(workspace, 1);  // workspace
@ -77,16 +142,22 @@ class PMEEnergyGpuKernel : public GpuKernel {
auto direct_ene = GetDeviceAddress<T>(outputs, 2); auto direct_ene = GetDeviceAddress<T>(outputs, 2);
auto correction_ene = GetDeviceAddress<T>(outputs, 3); auto correction_ene = GetDeviceAddress<T>(outputs, 3);
-    PMEEnergy(fftx, ffty, fftz, atom_numbers, beta, boxlength, pme_bc, pme_uxyz, pme_frxyz, pme_q, pme_fq,
-              pme_atom_near, pme_kxyz, uint_crd, charge, nl_numbers, nl_serial, nl, scaler, excluded_list_start,
-              excluded_list, excluded_atom_numbers, reciprocal_ene, self_ene, direct_ene, correction_ene,
-              reinterpret_cast<cudaStream_t>(stream_ptr));
+    cufftSetStream(PME_plan_r2c, reinterpret_cast<cudaStream_t>(stream_ptr));
+    cufftSetStream(PME_plan_c2r, reinterpret_cast<cudaStream_t>(stream_ptr));
+    cudaMemcpyAsync(pme_kxyz, PME_kxyz_cpu.data(), sizeof(UNSIGNED_INT_VECTOR) * 64, cudaMemcpyHostToDevice,
+                    reinterpret_cast<cudaStream_t>(stream_ptr));
+    cudaMemcpyAsync(pme_bc, PME_BC0.data(), sizeof(float) * PME_Nfft, cudaMemcpyHostToDevice,
+                    reinterpret_cast<cudaStream_t>(stream_ptr));
+    PMEEnergy(fftx, ffty, fftz, atom_numbers, beta, pme_bc, pme_uxyz, pme_frxyz, pme_q, pme_fq, pme_atom_near, pme_kxyz,
+              uint_crd, charge, nl_numbers, nl_serial, nl, scaler, excluded_list_start, excluded_list,
+              excluded_atom_numbers, reciprocal_ene, self_ene, direct_ene, correction_ene, _thread_PME, PME_Nin,
+              PME_Nfft, PME_Nall, PME_plan_r2c, PME_plan_c2r, reinterpret_cast<cudaStream_t>(stream_ptr));
    return true;
  }
 protected:
  void InitSizeLists() override {
-    input_size_list_.push_back(sizeof(VECTOR));
    input_size_list_.push_back(atom_numbers * sizeof(UNSIGNED_INT_VECTOR));
    input_size_list_.push_back(atom_numbers * sizeof(VECTOR));
    input_size_list_.push_back(atom_numbers * sizeof(T1));
@ -112,12 +183,56 @@ class PMEEnergyGpuKernel : public GpuKernel {
    output_size_list_.push_back(sizeof(T));
  }
cufftComplex expc(cufftComplex z) {
cufftComplex res;
float t = expf(z.x);
sincosf(z.y, &res.y, &res.x);
res.x *= t;
res.y *= t;
return res;
}
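  // Cardinal B-spline M_n(u), evaluated with the standard recursion; n == 2 is the triangle function on [0, 2].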
float M_(float u, int n) {
if (n == 2) {
if (u > 2 || u < 0) return 0;
return 1 - abs(u - 1);
} else {
return u / (n - 1) * M_(u, n - 1) + (n - u) / (n - 1) * M_(u - 1, n - 1);
}
}
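  // Squared modulus |b(k)|^2 of the reciprocal-space B-spline interpolation factor for an order-B_order spline.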
float getb(int k, int NFFT, int B_order) {
cufftComplex tempc, tempc2, res;
float tempf;
tempc2.x = 0;
tempc2.y = 0;
tempc.x = 0;
tempc.y = 2 * (B_order - 1) * PI * k / NFFT;
res = expc(tempc);
for (int kk = 0; kk < (B_order - 1); kk++) {
tempc.x = 0;
tempc.y = 2 * PI * k / NFFT * kk;
tempc = expc(tempc);
tempf = M_(kk + 1, B_order);
tempc2.x += tempf * tempc.x;
tempc2.y += tempf * tempc.y;
}
res = cuCdivf(res, tempc2);
return res.x * res.x + res.y * res.y;
}
 private:
  size_t ele_uint_crd = 1;
  std::vector<size_t> input_size_list_;
  std::vector<size_t> output_size_list_;
  std::vector<size_t> workspace_size_list_;
std::vector<float> B1;
std::vector<float> B2;
std::vector<float> B3;
std::vector<float> PME_BC0;
  int atom_numbers;
  int excluded_numbers;
  int max_nl_numbers = 800;
@ -125,8 +240,16 @@ class PMEEnergyGpuKernel : public GpuKernel {
  int ffty;
  int fftz;
  float beta;
int PME_Nin;
  int PME_Nall;
  int PME_Nfft;
float volume;
float PI = 3.1415926;
cufftHandle PME_plan_r2c;
cufftHandle PME_plan_c2r;
dim3 _thread_PME;
  struct VECTOR {
    float x;
    float y;
@ -138,7 +261,7 @@ class PMEEnergyGpuKernel : public GpuKernel {
    unsigned int uint_y;
    unsigned int uint_z;
  };
std::vector<UNSIGNED_INT_VECTOR> PME_kxyz_cpu;
  struct NEIGHBOR_LIST {
    int atom_numbers;
    int *atom_serial;

@ -17,13 +17,10 @@
namespace mindspore {
namespace kernel {
-MS_REG_GPU_KERNEL_TWO(PMEReciprocalForce,
-                      KernelAttr()
-                        .AddInputAttr(kNumberTypeFloat32)
-                        .AddInputAttr(kNumberTypeUInt32)
-                        .AddInputAttr(kNumberTypeFloat32)
-                        .AddOutputAttr(kNumberTypeFloat32),
-                      PMEReciprocalForceGpuKernel, float, int)
+MS_REG_GPU_KERNEL_TWO(
+  PMEReciprocalForce,
+  KernelAttr().AddInputAttr(kNumberTypeUInt32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
+  PMEReciprocalForceGpuKernel, float, int)
}  // namespace kernel
}  // namespace mindspore

@ -41,6 +41,75 @@ class PMEReciprocalForceGpuKernel : public GpuKernel {
    fftz = static_cast<int>(GetAttr<int64_t>(kernel_node, "fftz"));
    PME_Nall = fftx * ffty * fftz;
    PME_Nfft = fftx * ffty * (fftz / 2 + 1);
PME_Nin = ffty * fftz;
float box_length_0 = static_cast<float>(GetAttr<float_t>(kernel_node, "box_length_0"));
float box_length_1 = static_cast<float>(GetAttr<float_t>(kernel_node, "box_length_1"));
float box_length_2 = static_cast<float>(GetAttr<float_t>(kernel_node, "box_length_2"));
std::vector<float> h_box_length(3);
h_box_length[0] = box_length_0;
h_box_length[1] = box_length_1;
h_box_length[2] = box_length_2;
VECTOR *box_length = reinterpret_cast<VECTOR *>(h_box_length.data());
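    // Grid points per unit length along each axis; used to map atom coordinates onto the PME mesh.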
PME_inverse_box_vector.x = static_cast<float>(fftx) / box_length[0].x;
PME_inverse_box_vector.y = static_cast<float>(ffty) / box_length[0].y;
PME_inverse_box_vector.z = static_cast<float>(fftz) / box_length[0].z;
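    // 3D R2C/C2R cuFFT plans are created once here and bound to the launch stream in Launch().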
cufftPlan3d(&PME_plan_r2c, fftx, ffty, fftz, CUFFT_R2C);
cufftPlan3d(&PME_plan_c2r, fftx, ffty, fftz, CUFFT_C2R);
float volume = box_length[0].x * box_length[0].y * box_length[0].z;
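    // Enumerate the 4 x 4 x 4 grid offsets covered by the order-4 B-spline stencil around each atom.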
PME_kxyz_cpu.resize(64);
int kx, ky, kz, kxrp, kyrp, kzrp, index;
for (kx = 0; kx < 4; kx++) {
for (ky = 0; ky < 4; ky++) {
for (kz = 0; kz < 4; kz++) {
index = kx * 16 + ky * 4 + kz;
PME_kxyz_cpu[index].uint_x = kx;
PME_kxyz_cpu[index].uint_y = ky;
PME_kxyz_cpu[index].uint_z = kz;
}
}
}
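    // Per-axis B-spline moduli |b(k)|^2 and the combined B*C coefficient table (see getb/M_ below).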
B1.resize(fftx);
B2.resize(ffty);
B3.resize(fftz);
PME_BC0.resize(PME_Nfft);
for (kx = 0; kx < fftx; kx++) {
B1[kx] = getb(kx, fftx, 4);
}
for (ky = 0; ky < ffty; ky++) {
B2[ky] = getb(ky, ffty, 4);
}
for (kz = 0; kz < fftz; kz++) {
B3[kz] = getb(kz, fftz, 4);
}
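    // Same B*C construction as in PMEEnergyGpuKernel: the Ewald reciprocal-space prefactor times the
    // spline moduli, with the k = 0 term excluded.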
float mprefactor = PI * PI / -beta / beta;
float msq;
for (kx = 0; kx < fftx; kx++) {
kxrp = kx;
if (kx > fftx / 2) kxrp = fftx - kx;
for (ky = 0; ky < ffty; ky++) {
kyrp = ky;
if (ky > ffty / 2) kyrp = ffty - ky;
for (kz = 0; kz <= fftz / 2; kz++) {
kzrp = kz;
msq = kxrp * kxrp / box_length[0].x / box_length[0].x + kyrp * kyrp / box_length[0].y / box_length[0].y +
kzrp * kzrp / box_length[0].z / box_length[0].z;
index = kx * ffty * (fftz / 2 + 1) + ky * (fftz / 2 + 1) + kz;
if ((kx + ky + kz) == 0) {
PME_BC0[index] = 0;
} else {
PME_BC0[index] = 1.0 / PI / msq * exp(mprefactor * msq) / volume;
}
PME_BC0[index] *= B1[kx] * B2[ky] * B3[kz];
}
}
}
    InitSizeLists();
    return true;
@ -52,9 +121,8 @@ class PMEReciprocalForceGpuKernel : public GpuKernel {
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
-    auto boxlength = GetDeviceAddress<T>(inputs, 0);
-    auto uint_crd = GetDeviceAddress<const T1>(inputs, 1);
-    auto charge = GetDeviceAddress<T>(inputs, 2);
+    auto uint_crd = GetDeviceAddress<const T1>(inputs, 0);
+    auto charge = GetDeviceAddress<T>(inputs, 1);
    auto pme_uxyz = GetDeviceAddress<int>(workspace, 0);     // workspace
    auto pme_frxyz = GetDeviceAddress<float>(workspace, 1);  // workspace
@ -65,9 +133,15 @@ class PMEReciprocalForceGpuKernel : public GpuKernel {
    auto pme_kxyz = GetDeviceAddress<int>(workspace, 6);  // workspace
    auto force = GetDeviceAddress<T>(outputs, 0);
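    // Attach the cached cuFFT plans to this launch's stream and stage the host-side k-offset table
    // and B*C coefficients into device workspace before running the force kernels.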
cufftSetStream(PME_plan_r2c, reinterpret_cast<cudaStream_t>(stream_ptr));
cufftSetStream(PME_plan_c2r, reinterpret_cast<cudaStream_t>(stream_ptr));
cudaMemcpyAsync(pme_kxyz, PME_kxyz_cpu.data(), sizeof(UNSIGNED_INT_VECTOR) * 64, cudaMemcpyHostToDevice,
reinterpret_cast<cudaStream_t>(stream_ptr));
cudaMemcpyAsync(pme_bc, PME_BC0.data(), sizeof(float) * PME_Nfft, cudaMemcpyHostToDevice,
reinterpret_cast<cudaStream_t>(stream_ptr));
    PMEReciprocalForce(fftx, ffty, fftz, atom_numbers, beta, pme_bc, pme_uxyz, pme_frxyz, pme_q, pme_fq, pme_atom_near,
-                       pme_kxyz, boxlength, uint_crd, charge, force, reinterpret_cast<cudaStream_t>(stream_ptr));
+                       pme_kxyz, uint_crd, charge, force, PME_Nin, PME_Nall, PME_Nfft, PME_plan_r2c, PME_plan_c2r,
+                       PME_inverse_box_vector, reinterpret_cast<cudaStream_t>(stream_ptr));
    return true;
  }
@ -88,6 +162,44 @@ class PMEReciprocalForceGpuKernel : public GpuKernel {
    output_size_list_.push_back(atom_numbers * sizeof(VECTOR));
  }
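  // Helper routines shared with PMEEnergyGpuKernel: complex exponential, cardinal B-spline M_n, and |b(k)|^2.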
cufftComplex expc(cufftComplex z) {
cufftComplex res;
float t = expf(z.x);
sincosf(z.y, &res.y, &res.x);
res.x *= t;
res.y *= t;
return res;
}
float M_(float u, int n) {
if (n == 2) {
if (u > 2 || u < 0) return 0;
return 1 - abs(u - 1);
} else {
return u / (n - 1) * M_(u, n - 1) + (n - u) / (n - 1) * M_(u - 1, n - 1);
}
}
float getb(int k, int NFFT, int B_order) {
cufftComplex tempc, tempc2, res;
float tempf;
tempc2.x = 0;
tempc2.y = 0;
tempc.x = 0;
tempc.y = 2 * (B_order - 1) * PI * k / NFFT;
res = expc(tempc);
for (int kk = 0; kk < (B_order - 1); kk++) {
tempc.x = 0;
tempc.y = 2 * PI * k / NFFT * kk;
tempc = expc(tempc);
tempf = M_(kk + 1, B_order);
tempc2.x += tempf * tempc.x;
tempc2.y += tempf * tempc.y;
}
res = cuCdivf(res, tempc2);
return res.x * res.x + res.y * res.y;
}
 private:
  size_t ele_uint_crd = 1;
@ -101,18 +213,27 @@ class PMEReciprocalForceGpuKernel : public GpuKernel {
  float beta;
  int PME_Nall;
  int PME_Nfft;
int PME_Nin;
float PI = 3.1415926;
std::vector<float> B1;
std::vector<float> B2;
std::vector<float> B3;
std::vector<float> PME_BC0;
cufftHandle PME_plan_r2c;
cufftHandle PME_plan_c2r;
  struct VECTOR {
    float x;
    float y;
    float z;
  };
_VECTOR PME_inverse_box_vector;
  struct UNSIGNED_INT_VECTOR {
    unsigned int uint_x;
    unsigned int uint_y;
    unsigned int uint_z;
  };
std::vector<UNSIGNED_INT_VECTOR> PME_kxyz_cpu;
};
}  // namespace kernel
}  // namespace mindspore
