From eb98e558434ae5d9b93d3a15e766d3b968eb6d6c Mon Sep 17 00:00:00 2001 From: limingqi107 Date: Sun, 27 Dec 2020 15:19:07 +0800 Subject: [PATCH] fix ascend ps cache loss invalid --- mindspore/ccsrc/ps/ps_cache/ascend/ascend_ps_cache.cc | 9 ++++++--- mindspore/ccsrc/ps/ps_cache/ascend/ascend_ps_cache.h | 2 +- mindspore/ccsrc/ps/ps_cache/ps_cache_basic.h | 2 +- mindspore/ccsrc/ps/ps_cache/ps_cache_manager.cc | 1 + mindspore/nn/layer/embedding.py | 5 ++++- 5 files changed, 13 insertions(+), 6 deletions(-) diff --git a/mindspore/ccsrc/ps/ps_cache/ascend/ascend_ps_cache.cc b/mindspore/ccsrc/ps/ps_cache/ascend/ascend_ps_cache.cc index 779f13802d..f91d8cb407 100644 --- a/mindspore/ccsrc/ps/ps_cache/ascend/ascend_ps_cache.cc +++ b/mindspore/ccsrc/ps/ps_cache/ascend/ascend_ps_cache.cc @@ -131,15 +131,18 @@ void *AscendPsCache::MallocMemory(size_t size) { return device::ascend::AscendMemoryPool::GetInstance().AllocTensorMem(size); } -bool AscendPsCache::MallocConstantMemory(size_t constant_value) { +bool AscendPsCache::MallocConstantMemory(size_t cache_vocab_size) { offset_addr_ = reinterpret_cast<int *>(device::ascend::AscendMemoryPool::GetInstance().AllocTensorMem(sizeof(int))); MS_ERROR_IF_NULL(offset_addr_); rtMemset(offset_addr_, sizeof(int), 0, sizeof(int)); cache_vocab_size_addr_ = reinterpret_cast<int *>(device::ascend::AscendMemoryPool::GetInstance().AllocTensorMem(sizeof(int))); MS_ERROR_IF_NULL(cache_vocab_size_addr_); - rtMemset(cache_vocab_size_addr_, sizeof(int), constant_value, sizeof(int)); - return true; + int copy_value = SizeToInt(cache_vocab_size); + if (!CopyHostMemToDevice(cache_vocab_size_addr_, &copy_value, sizeof(int))) { + return false; + } + return SynchronizeStream(); } bool AscendPsCache::RecordEvent() { diff --git a/mindspore/ccsrc/ps/ps_cache/ascend/ascend_ps_cache.h b/mindspore/ccsrc/ps/ps_cache/ascend/ascend_ps_cache.h index d52a11222a..db4bec840b 100644 --- a/mindspore/ccsrc/ps/ps_cache/ascend/ascend_ps_cache.h +++ 
b/mindspore/ccsrc/ps/ps_cache/ascend/ascend_ps_cache.h @@ -51,7 +51,7 @@ class AscendPsCache : public PsCacheBasic { ~AscendPsCache() override = default; bool InitDevice(uint32_t device_id, const void *context) override; void *MallocMemory(size_t size) override; - bool MallocConstantMemory(size_t constant_value) override; + bool MallocConstantMemory(size_t cache_vocab_size) override; bool RecordEvent() override; bool SynchronizeEvent() override; bool SynchronizeStream() override; diff --git a/mindspore/ccsrc/ps/ps_cache/ps_cache_basic.h b/mindspore/ccsrc/ps/ps_cache/ps_cache_basic.h index fe2727e7ee..33713bb108 100644 --- a/mindspore/ccsrc/ps/ps_cache/ps_cache_basic.h +++ b/mindspore/ccsrc/ps/ps_cache/ps_cache_basic.h @@ -34,7 +34,7 @@ class PsCacheBasic { virtual ~PsCacheBasic() = default; virtual bool InitDevice(uint32_t device_id, const void *context) = 0; virtual void *MallocMemory(size_t size) = 0; - virtual bool MallocConstantMemory(size_t constant_value) { return true; } + virtual bool MallocConstantMemory(size_t cache_vocab_size) { return true; } virtual bool RecordEvent() = 0; virtual bool SynchronizeEvent() = 0; virtual bool SynchronizeStream() = 0; diff --git a/mindspore/ccsrc/ps/ps_cache/ps_cache_manager.cc b/mindspore/ccsrc/ps/ps_cache/ps_cache_manager.cc index 72fefeffbb..6e2babb52a 100644 --- a/mindspore/ccsrc/ps/ps_cache/ps_cache_manager.cc +++ b/mindspore/ccsrc/ps/ps_cache/ps_cache_manager.cc @@ -674,6 +674,7 @@ bool PsCacheManager::HashSwapHostToDevice(const HashTableInfo &hash_info) { RETURN_IF_FALSE(embedding_device_cache_->cache_->HashSwapIn( hash_table_addr, embedding_device_cache_->hash_swap_value_addr_, embedding_device_cache_->hash_swap_index_addr_, hash_table_size, embedding_size, swap_indices_size)); + RETURN_IF_FALSE(embedding_device_cache_->cache_->SynchronizeStream()); return true; } diff --git a/mindspore/nn/layer/embedding.py b/mindspore/nn/layer/embedding.py index e8361e6b19..bdef289156 100755 --- a/mindspore/nn/layer/embedding.py 
+++ b/mindspore/nn/layer/embedding.py @@ -168,7 +168,10 @@ class EmbeddingLookup(Cell): max_norm (Union[float, None]): A maximum clipping value. The data type must be float16, float32 or None. Default: None sparse (bool): Using sparse mode. When 'target' is set to 'CPU', 'sparse' has to be true. Default: True. - vocab_cache_size (int): Cache size of the dictionary of embeddings. + vocab_cache_size (int): Cache size of the dictionary of embeddings. Default: 0. It is valid only in + parameter server training mode and 'DEVICE' target. And the moment parameter of the corresponding + optimizer will also be set to the cache size. In addition, it should be noted that it will cost the 'DEVICE' + memory, so it is recommended to set a reasonable value to avoid insufficient memory. Inputs: - **input_indices** (Tensor) - The shape of tensor is :math:`(y_1, y_2, ..., y_S)`.