test=develop
@ -361,9 +361,10 @@ class AdamOpKernel : public framework::OpKernel<T> {
if (lazy_mode) {
std::vector<int64_t> id_vector;
size_t row_count = grad_merge.rows().size();
std::vector<int64_t> cpu_rows(grad_merge.rows());
for (size_t row_index = 0; row_index < row_count; ++row_index) {
for (size_t offset = 0; offset < row_numel; ++offset) {
size_t i = rows[row_index] * row_numel + offset;
size_t i = cpu_rows[row_index] * row_numel + offset;
id_vector.push_back(i);
}
@ -128,7 +128,7 @@ struct ForRangeIn<CUDADeviceContext> {
int grid_size = (range_.size() + num_threads - 1) / num_threads;
ForRangeInElemwiseOp<<<grid_size, block_size, 0, dev_ctx_.stream()>>>(
func, range_.data(), range_size);
func, range_.CUDAData(dev_ctx_.GetPlace()), range_size);
const CUDADeviceContext& dev_ctx_;