|
|
|
@ -64,7 +64,7 @@ bool DatasetIteratorKernel::Init(const CNodePtr &kernel_node) {
|
|
|
|
|
void DatasetIteratorKernel::InitSizeLists() { return; }
|
|
|
|
|
|
|
|
|
|
bool DatasetIteratorKernel::Launch(const std::vector<AddressPtr> &, const std::vector<AddressPtr> &,
|
|
|
|
|
const std::vector<AddressPtr> &outputs, void *) {
|
|
|
|
|
const std::vector<AddressPtr> &outputs, void *stream) {
|
|
|
|
|
void *addr = nullptr;
|
|
|
|
|
size_t len = 0;
|
|
|
|
|
|
|
|
|
@ -96,11 +96,14 @@ bool DatasetIteratorKernel::Launch(const std::vector<AddressPtr> &, const std::v
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
for (size_t i = 0; i < output_size_list_.size(); i++) {
|
|
|
|
|
CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpy(outputs[i]->addr, addr, output_size_list_[i], cudaMemcpyDeviceToDevice),
|
|
|
|
|
CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(outputs[i]->addr, addr, output_size_list_[i], cudaMemcpyDeviceToDevice,
|
|
|
|
|
reinterpret_cast<cudaStream_t>(stream)),
|
|
|
|
|
"Cuda Memcpy Failed");
|
|
|
|
|
addr = reinterpret_cast<unsigned char *>(addr) + output_size_list_[i];
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
CHECK_CUDA_RET_WITH_EXCEPT(cudaStreamSynchronize(reinterpret_cast<cudaStream_t>(stream)),
|
|
|
|
|
"cudaStreamSynchronize failed");
|
|
|
|
|
(void)GpuBufferMgr::GetInstance().Pop(handle_);
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|