@ -39,51 +39,98 @@ void FetchOpHandle::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) {
PADDLE_THROW ( " Nobody should wait FetchOp. Unexpceted Error " ) ;
}
// NOTE(review): these five lines look like the removed ("-") side of a diff.
// They open FetchOpHandle::WaitAndMergeCPUTensors, an `if (return_merged_)`
// and a loop over tensors_[1..], reading dims() of each entry, but none of
// those scopes is closed before the next definition starts.
// Presumably superseded by WaitAndMergeCPUFetchVars further down — confirm
// against the real file and drop these lines if they are stale.
void FetchOpHandle : : WaitAndMergeCPUTensors ( ) const {
if ( return_merged_ ) {
const auto & tensor_dims = tensors_ [ 0 ] . dims ( ) ;
for ( size_t i = 1 ; i < tensors_ . size ( ) ; i + + ) {
const auto & ele_dims = tensors_ [ i ] . dims ( ) ;
static void CheckDims ( const framework : : DDim & tensor_dims ,
const framework : : DDim & ele_dims , const size_t offset ) {
PADDLE_ENFORCE_EQ (
tensor_dims . size ( ) , ele_dims . size ( ) ,
platform : : errors : : Fatal ( " The dimension sizes of fetched Tensors are "
platform : : errors : : Fatal ( " The dimension sizes of fetched Tensors or "
" the items of fetched LoDTensorArray are "
" different from each other on different "
" devices. And the error is caused by the %zu "
" (th) fetched variable. Please set the "
" parameter `return_merged = False` when you "
" call the `Executor.run()` method. " ,
offset _ ) ) ;
offset ) ) ;
for ( int j = 1 ; j < tensor_dims . size ( ) ; j + + ) {
PADDLE_ENFORCE_EQ (
tensor_dims [ j ] , ele_dims [ j ] ,
platform : : errors : : Fatal ( " The dimensions of fetched Tensors are "
platform : : errors : : Fatal ( " The dimensions of fetched Tensors or "
" the items of fetched LoDTensorArray are "
" different from each other on different "
" devices. And the error is caused by the "
" %zu (th) fetched variable. Please set the "
" parameter `return_merged = False` when "
" you call the `Executor.run()` method. " ,
offset_ ) ) ;
offset ) ) ;
}
}
void FetchOpHandle : : WaitAndMergeCPUFetchVars ( ) const {
if ( return_merged_ ) {
if ( data_is_lod_tensor ( tensors_ [ 0 ] ) ) {
const auto & tensor_dims = boost : : get < LoDTensor > ( tensors_ [ 0 ] ) . dims ( ) ;
for ( size_t i = 1 ; i < tensors_ . size ( ) ; i + + ) {
const auto & ele_dims = boost : : get < LoDTensor > ( tensors_ [ i ] ) . dims ( ) ;
CheckDims ( tensor_dims , ele_dims , offset_ ) ;
}
std : : vector < const LoDTensor * > tensors_ptr ;
tensors_ptr . reserve ( tensors_ . size ( ) ) ;
for ( auto & t : tensors_ ) {
tensors_ptr . emplace_back ( & t ) ;
tensors_ptr . emplace_back ( & boost : : get < LoDTensor > ( t ) ) ;
}
auto & val = boost : : get < FetchList > ( * data_ ) ;
LoDTensor var ;
var . MergeLoDTensor ( tensors_ptr , platform : : CPUPlace ( ) ) ;
val . at ( offset_ ) = std : : move ( var ) ;
} else {
auto & array = boost : : get < LoDTensorArray > ( tensors_ [ 0 ] ) ;
LoDTensorArray tmp_array ;
tmp_array . reserve ( array . size ( ) ) ;
for ( size_t i = 0 ; i < array . size ( ) ; + + i ) {
const auto & tensor_dims = array [ i ] . dims ( ) ;
std : : vector < const LoDTensor * > tensors_ptr ;
tensors_ptr . reserve ( tensors_ . size ( ) ) ;
tensors_ptr . push_back ( & array [ i ] ) ;
for ( size_t j = 1 ; j < tensors_ . size ( ) ; + + j ) {
auto & element = boost : : get < LoDTensorArray > ( tensors_ [ j ] ) ;
const auto & ele_dims = element [ i ] . dims ( ) ;
CheckDims ( tensor_dims , ele_dims , offset_ ) ;
tensors_ptr . push_back ( & element [ i ] ) ;
}
tmp_array . emplace_back ( ) ;
tmp_array . back ( ) . MergeLoDTensor ( tensors_ptr , platform : : CPUPlace ( ) ) ;
}
auto & val = boost : : get < FetchList > ( * data_ ) ;
val . at ( offset_ ) = std : : move ( tmp_array ) ;
}
auto & val = boost : : get < FeedFetchList > ( * data_ ) ;
val . at ( offset_ ) . MergeLoDTensor ( tensors_ptr , platform : : CPUPlace ( ) ) ;
} else {
auto & val = boost : : get < FetchUnmergedList > ( * data_ ) ;
val . at ( offset_ ) = std : : move ( tensors_ ) ;
}
}
static void TransData ( const framework : : LoDTensor & src_item ,
framework : : LoDTensor * dst_item ) {
if ( src_item . IsInitialized ( ) & & src_item . numel ( ) > 0 ) {
if ( platform : : is_gpu_place ( src_item . place ( ) ) ) {
# ifdef PADDLE_WITH_CUDA
TensorCopy ( src_item , platform : : CPUPlace ( ) , dst_item ) ;
# endif
} else {
dst_item - > ShareDataWith ( src_item ) ;
}
} else {
dst_item - > clear ( ) ;
dst_item - > Resize ( { 0 } ) ;
}
dst_item - > set_lod ( src_item . lod ( ) ) ;
}
// Runs the fetch: waits for the inputs with CPUPlace, sizes tensors_ to one
// entry per input, fills each entry from the matching scope variable, then
// hands over to the merge step.
// NOTE(review): this span looks like unresolved diff output. A hunk header
// sits inside the loop body, and statements that operate on tensors_[i]
// directly appear next to statements that go through boost::get/TransData.
// It is documented as-is rather than rewritten — confirm against the real
// file which statements belong to the current version.
void FetchOpHandle : : RunImpl ( ) {
platform : : RecordEvent record_event ( Name ( ) ) ;
WaitInputVarGenerated ( platform : : CPUPlace ( ) ) ;
tensors_ . resize ( inputs_ . size ( ) ) ;
platform : : CPUPlace cpu ;
auto & scopes = * local_exec_scopes_ ;
for ( size_t i = 0 ; i < inputs_ . size ( ) ; + + i ) {
// NOTE(review): diff hunk header; the statements that define `var_handle`
// and `var` (both used just below) are not visible in this view.
@ -93,23 +140,21 @@ void FetchOpHandle::RunImpl() {
PADDLE_ENFORCE_NOT_NULL ( var , " Cannot find variable %s in execution scope " ,
var_handle - > name ( ) ) ;
// A LoDTensor variable is transferred as a single item.
if ( var - > IsType < LoDTensor > ( ) ) {
auto & t = var - > Get < framework : : LoDTensor > ( ) ;
if ( t . IsInitialized ( ) & & t . numel ( ) > 0 ) {
if ( platform : : is_gpu_place ( t . place ( ) ) ) {
# ifdef PADDLE_WITH_CUDA
TensorCopy ( t , cpu , & tensors_ [ i ] ) ;
# endif
auto & item = boost : : get < LoDTensor > ( tensors_ [ i ] ) ;
TransData ( t , & item ) ;
} else {
tensors_ [ i ] . ShareDataWith ( t ) ;
// A LoDTensorArray variable: tensors_[i] is assigned an array of the same
// length, then every element is transferred with TransData.
auto & t = var - > Get < framework : : LoDTensorArray > ( ) ;
LoDTensorArray tmp ( t . size ( ) ) ;
tensors_ [ i ] = tmp ;
auto & item = boost : : get < LoDTensorArray > ( tensors_ [ i ] ) ;
for ( size_t j = 0 ; j < t . size ( ) ; + + j ) {
TransData ( t [ j ] , & item [ j ] ) ;
}
} else {
tensors_ [ i ] . clear ( ) ;
tensors_ [ i ] . Resize ( { 0 } ) ;
}
tensors_ [ i ] . set_lod ( t . lod ( ) ) ;
}
// NOTE(review): two merge calls appear back to back; presumably only one of
// them belongs to the current version of the file — confirm.
this - > WaitAndMergeCPUTensors ( ) ;
this - > WaitAndMergeCPUFetchVars ( ) ;
}
void FetchOpHandle : : WaitInputVarGenerated ( const platform : : Place & place ) {