synchronize latest Ascend software suite 18 Jul 2020, and merging branches

pull/3198/head
yanghaoran 5 years ago
commit 859acc6d2a

.gitmodules

@@ -15,4 +15,4 @@
url = https://gitee.com/mindspore/akg.git
[submodule "graphengine"]
path = graphengine
url = https://gitee.com/ms-incubator/graphengine.git
url = https://gitee.com/mindspore/graphengine.git

@@ -202,10 +202,10 @@ Check out how MindSpore Open Governance [works](https://gitee.com/mindspore/comm
### Communication
- [MindSpore Slack](https://join.slack.com/t/mindspore/shared_invite/enQtOTcwMTIxMDI3NjM0LTNkMWM2MzI5NjIyZWU5ZWQ5M2EwMTQ5MWNiYzMxOGM4OWFhZjI4M2E5OGI2YTg3ODU1ODE2Njg1MThiNWI3YmQ) - Communication platform for developers.
- [MindSpore Slack](https://join.slack.com/t/mindspore/shared_invite/zt-dgk65rli-3ex4xvS4wHX7UDmsQmfu8w) - Communication platform for developers.
- IRC channel at `#mindspore` (only for meeting minutes logging purpose)
- Video Conferencing: https://meet.jit.si
- Mailing-list: https://mailweb.mindspore.cn/postorius/lists
- Video Conferencing: TBD
- Mailing-list: <https://mailweb.mindspore.cn/postorius/lists>
## Contributing

akg

@@ -1 +1 @@
Subproject commit df57a6cf9450e347d1854687d1fe66a420ee3b35
Subproject commit f60af9df4220bf3db5de2b224418953c0dc1f625

@@ -24,7 +24,7 @@ usage()
{
echo "Usage:"
echo "bash build.sh [-d] [-r] [-v] [-c on|off] [-t on|off] [-g on|off] [-h] [-b ge] [-m infer|train] \\"
echo " [-a on|off] [-Q on|off] [-S on|off] [-p on|off] [-i] [-L] [-R] [-D on|off] [-j[n]] [-e gpu|d|cpu] \\"
echo " [-a on|off] [-Q on|off] [-p on|off] [-i] [-L] [-R] [-D on|off] [-j[n]] [-e gpu|d|cpu] \\"
echo " [-P on|off] [-z [on|off]] [-M on|off] [-V 9.2|10.1] [-I] [-K] [-B on|off] [-E] [-l on|off]"
echo ""
echo "Options:"
@@ -48,7 +48,6 @@ usage()
echo " -P Enable dump anf graph to file in ProtoBuffer format, default on"
echo " -Q Enable dump memory, default off"
echo " -D Enable dumping of function graph ir, default on"
echo " -S Enable async data dump, default off"
echo " -z Compile dataset & mindrecord, default on"
echo " -M Enable MPI and NCCL for GPU training, gpu default on"
echo " -V Specify the minimum required cuda version, default CUDA 10.1"
@@ -89,7 +88,6 @@ checkopts()
ENABLE_TIMELINE="off"
ENABLE_DUMP2PROTO="on"
ENABLE_DUMPE2E="off"
ENABLE_DATA_DUMP="off"
ENABLE_DUMP_IR="on"
COMPILE_MINDDATA="on"
ENABLE_MPI="off"
@@ -104,7 +102,7 @@ checkopts()
ENABLE_PYTHON="on"
# Process the options
while getopts 'drvj:c:t:hsb:a:g:p:ie:m:l:I:LRP:Q:S:D:zM:V:K:sB:E' opt
while getopts 'drvj:c:t:hsb:a:g:p:ie:m:l:I:LRP:Q:D:zM:V:K:sB:E' opt
do
OPTARG=$(echo ${OPTARG} | tr '[A-Z]' '[a-z]')
case "${opt}" in
@@ -186,6 +184,7 @@ checkopts()
elif [[ "X$OPTARG" == "Xd" || "X$OPTARG" == "Xascend" ]]; then
ENABLE_D="on"
ENABLE_CPU="on"
ENABLE_SERVING="on"
elif [[ "X$OPTARG" == "Xcpu" ]]; then
ENABLE_CPU="on"
else
@@ -220,11 +219,6 @@ checkopts()
ENABLE_DUMPE2E="$OPTARG"
echo "enable dump end to end"
;;
S)
check_on_off $OPTARG S
ENABLE_DATA_DUMP="$OPTARG"
echo "enable data dump"
;;
D)
check_on_off $OPTARG D
ENABLE_DUMP_IR="$OPTARG"
@@ -328,9 +322,6 @@ build_mindspore()
if [[ "X$ENABLE_DUMPE2E" = "Xon" ]]; then
CMAKE_ARGS="${CMAKE_ARGS} -DENABLE_DUMP_E2E=ON"
fi
if [[ "X$ENABLE_DATA_DUMP" = "Xon" ]]; then
CMAKE_ARGS="${CMAKE_ARGS} -DENABLE_DATA_DUMP=ON"
fi
CMAKE_ARGS="${CMAKE_ARGS} -DENABLE_DUMP_IR=${ENABLE_DUMP_IR}"
CMAKE_ARGS="${CMAKE_ARGS} -DENABLE_PYTHON=${ENABLE_PYTHON}"
if [[ "X$ENABLE_MPI" = "Xon" ]]; then

@@ -1,4 +1,4 @@
set(glog_CXXFLAGS "-D_FORTIFY_SOURCE=2 -O2 ${SECURE_CXX_FLAGS}")
set(glog_CXXFLAGS "-D_FORTIFY_SOURCE=2 -O2 ${SECURE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0")
set(glog_CFLAGS "-D_FORTIFY_SOURCE=2 -O2")
mindspore_add_pkg(glog
VER 0.4.0

@@ -116,10 +116,10 @@ if(ENABLE_DUMP_E2E)
add_compile_definitions(ENABLE_DUMP_E2E)
endif()
if(ENABLE_DATA_DUMP)
add_compile_definitions(ENABLE_DATA_DUMP)
endif()
if(ENABLE_DEBUGGER)
add_compile_definitions(ENABLE_DEBUGGER)
endif()
if(ENABLE_TESTCASES)
add_compile_definitions(ENABLE_TESTCASES)
endif()

@@ -1,13 +1,16 @@
# find exec
find_package(Python3 3.7 COMPONENTS Interpreter Development)
if (NOT Python3_FOUND)
message("No python3 found.")
return ()
message(FATAL_ERROR "No python3 found.")
endif ()
set(PYTHON ${Python3_EXECUTABLE})
set(PYTHON_VERSION ${Python3_VERSION_MAJOR}.${Python3_VERSION_MINOR})
if (NOT PYTHON_VERSION MATCHES "3.7")
message(FATAL_ERROR "FIND PYTHON VERSION ${PYTHON_VERSION} BUT CAN NOT MATCH PYTHON VERSION 3.7")
endif ()
find_package(Git)
if (NOT GIT_FOUND)
message("No git found.")

@@ -1 +1 @@
Subproject commit eee707935c066c16e9b9cd207f8125871b6b97cf
Subproject commit 103f2d1019dc50d781d7a964551d9f1f50b3b009

@@ -17,7 +17,7 @@
"""Resources for ast tree parse."""
import ast
import math
from mindspore import IndexedSlices
from mindspore import IndexedSlices, SparseTensor
from mindspore.ops.composite import multitype_ops
from mindspore.ops import functional as F, composite as C
from . import standard_method as M
@@ -140,4 +140,5 @@ convert_object_map = {
# user defined
IndexedSlices: F.make_indexed_slices,
SparseTensor: F.make_sparse_tensor,
}

@@ -44,7 +44,7 @@ if(ENABLE_GPU)
"backend/kernel_compiler/akg/akg_kernel_attrs_process.cc"
)
list(APPEND CUDA_NVCC_FLAGS -arch=sm_53)
list(APPEND CUDA_NVCC_FLAGS -arch=sm_53 --expt-relaxed-constexpr)
list(REMOVE_ITEM GPU_SRC_LIST "runtime/device/gpu/blocking_queue.cc" "runtime/device/gpu/gpu_buffer_mgr.cc")
list(REMOVE_ITEM GPU_SRC_LIST "runtime/device/gpu/mpi/mpi_initializer.cc"
"runtime/device/gpu/distribution/collective_wrapper.cc"

@@ -26,14 +26,6 @@ if (ENABLE_CPU)
"cpu/*.cc"
)
list(REMOVE_ITEM CPU_SRC_LIST "cpu/ps/push_kernel.cc"
"cpu/ps/pull_kernel.cc"
"cpu/ps/embedding_look_up_ps_kernel.cc"
"cpu/ps/embedding_look_up_proxy_kernel.cc"
"cpu/ps/apply_momentum_ps_kernel.cc"
"cpu/ps/sparse_apply_adam_ps_kernel.cc"
"cpu/ps/sparse_apply_ftrl_ps_kernel.cc")
if (NOT ENABLE_MPI)
list(REMOVE_ITEM CPU_SRC_LIST "cpu/allgather_cpu_kernel.cc")
list(REMOVE_ITEM CPU_SRC_LIST "cpu/reduce_scatter_cpu_kernel.cc")
@@ -41,6 +33,17 @@ if (ENABLE_CPU)
endif ()
endif ()
if (${CMAKE_SYSTEM_NAME} MATCHES "Windows" OR ENABLE_GE)
list(REMOVE_ITEM CPU_SRC_LIST "cpu/ps/apply_momentum_ps_kernel.cc")
list(REMOVE_ITEM CPU_SRC_LIST "cpu/ps/embedding_look_up_proxy_kernel.cc")
list(REMOVE_ITEM CPU_SRC_LIST "cpu/ps/embedding_look_up_ps_kernel.cc")
list(REMOVE_ITEM CPU_SRC_LIST "cpu/ps/pserver_kernel.cc")
list(REMOVE_ITEM CPU_SRC_LIST "cpu/ps/pull_kernel.cc")
list(REMOVE_ITEM CPU_SRC_LIST "cpu/ps/push_kernel.cc")
list(REMOVE_ITEM CPU_SRC_LIST "cpu/ps/sparse_apply_adam_ps_kernel.cc")
list(REMOVE_ITEM CPU_SRC_LIST "cpu/ps/sparse_apply_ftrl_ps_kernel.cc")
endif()
if (ENABLE_GPU)
file(GLOB_RECURSE CUDA_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
"gpu/*.cu"

@@ -18,6 +18,7 @@
#include <algorithm>
#include "backend/session/anf_runtime_algorithm.h"
#include "backend/optimizer/common/helper.h"
#include "backend/kernel_compiler/common_utils.h"
namespace mindspore {
namespace kernel {
@@ -75,15 +76,7 @@ void SetAkgAttrsForCast(const AnfNodePtr &anf_node) {
std::string dst_type;
TypeId output_type = AnfAlgo::GetOutputDeviceDataType(anf_node, 0);
if (output_type == kFloat32->type_id()) {
dst_type = "float32";
} else if (output_type == kFloat16->type_id()) {
dst_type = "float16";
} else if (output_type == kInt32->type_id()) {
dst_type = "int32";
} else {
MS_LOG(WARNING) << "Unknown cast_to type: " << TypeIdToType(output_type)->ToString();
}
dst_type = TypeId2String(output_type);
AnfAlgo::SetNodeAttr("dst_type", MakeValue(dst_type), anf_node);
}

@@ -21,9 +21,7 @@
#include <memory>
#include "framework/ge_runtime/task_info.h"
#include "backend/kernel_compiler/kernel.h"
#ifdef ENABLE_DATA_DUMP
#include "debug/data_dump_parser.h"
#endif
using TaskInfoPtr = std::shared_ptr<ge::model_runner::TaskInfo>;
namespace mindspore {
@@ -34,13 +32,7 @@ class AscendKernelMod : public KernelMod {
const std::vector<AddressPtr> &, uint32_t) = 0;
uint32_t block_dim() { return block_dim_; }
uint32_t stream_id() { return stream_id_; }
virtual bool NeedDump() {
#ifdef ENABLE_DATA_DUMP
return DataDumpParser::GetInstance().NeedDump(kernel_name_);
#else
return false;
#endif
}
virtual bool NeedDump() { return DataDumpParser::GetInstance().NeedDump(kernel_name_); }
protected:
uint32_t block_dim_{1};

File diff suppressed because it is too large.

@@ -73,9 +73,18 @@ class KernelMeta {
};
struct SparseGradient {
float *value_;
int *indices_;
size_t indices_size_;
float *value_{nullptr};
int *indices_{nullptr};
size_t indices_size_{0};
};
struct ReduceSparseGradientParam {
SparseGradient *input_grad_{nullptr};
SparseGradient *workspace_grad_{nullptr};
SparseGradient *output_grad_{nullptr};
size_t max_index_{0};
size_t value_stride_{0};
bool use_sort_reduce_{false};
};
struct MultiThreadComputeParams {
@@ -112,10 +121,6 @@ void SaveJsonInfo(const std::string &json_name, const std::string &info);
std::string GetProcessor(const AnfNodePtr &anf_node);
bool IsSameShape(const std::vector<size_t> &shape_a, const std::vector<size_t> &shape_b);
int Sign(float x);
void DeduplicateIndexedSlices(const SparseGradient &origin_sparse_grad, SparseGradient *unique_grad, size_t first_dim,
size_t outer_dim);
void ReduceSparseGradient(const SparseGradient &origin_sparse_grad, SparseGradient *unique_grad, size_t first_dim,
size_t outer_dim, bool use_multi_threads = true);
std::pair<AnfNodePtr, size_t> GetKernelInput(const AnfNodePtr &anf_node, size_t index);
std::vector<std::pair<AnfNodePtr, std::pair<size_t, size_t>>> GetInputIndex(const std::vector<AnfNodePtr> &node_list,
const std::vector<AnfNodePtr> &input_list);
@@ -130,14 +135,7 @@ void GetGraphRealOutput(const FuncGraphPtr &func_graph, std::vector<std::pair<An
bool IsWeightBoundary(const AnfNodePtr &node);
void MultiThreadCompute(const MultiThreadComputeFunc &func, MultiThreadComputeParams *params,
size_t total_compute_size);
void RunMultiThreadReduceSparseGradient(const SparseGradient &origin_sparse_grad, SparseGradient *unique_grad,
size_t outer_dim, std::vector<std::pair<int, size_t>> *sorted_indices,
std::vector<size_t> *slice_positions);
void ReduceMultiSparseGradient(const std::vector<std::shared_ptr<SparseGradient>> &unique_slice_grads,
SparseGradient *tmp_grad, SparseGradient *unique_grad, size_t first_dim,
size_t outer_dim);
void TwoLevelReduceSparseGradient(const SparseGradient &origin_sparse_grad, SparseGradient *tmp_grad,
SparseGradient *unique_grad, size_t first_dim, size_t outer_dim);
void BucketReduceSparseGradient(const ReduceSparseGradientParam &param);
std::vector<int> GetReduceAttrAxis(const CNodePtr &cnode);
} // namespace kernel
} // namespace mindspore
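
The removed per-stage helpers (DeduplicateIndexedSlices, ReduceSparseGradient, RunMultiThreadReduceSparseGradient, ReduceMultiSparseGradient, TwoLevelReduceSparseGradient) are consolidated into the single BucketReduceSparseGradient entry point configured through ReduceSparseGradientParam. A hypothetical call-site sketch follows; the mapping of max_index_ to the old first_dim and value_stride_ to outer_dim is an inference from the removed signatures, and the buffer wiring is illustrative only:

    // Illustrative only: shows how the new parameter struct might be filled.
    // The struct fields and the BucketReduceSparseGradient declaration come from
    // backend/kernel_compiler/common_utils.h as changed in this commit.
    #include <cstddef>
    #include "backend/kernel_compiler/common_utils.h"

    namespace example {
    using mindspore::kernel::BucketReduceSparseGradient;
    using mindspore::kernel::ReduceSparseGradientParam;
    using mindspore::kernel::SparseGradient;

    void ReduceDuplicatedIndices(SparseGradient *input, SparseGradient *workspace,
                                 SparseGradient *output, size_t first_dim, size_t outer_dim) {
      ReduceSparseGradientParam param;
      param.input_grad_ = input;          // raw gradient, may contain duplicate indices
      param.workspace_grad_ = workspace;  // scratch buffers sized like the input
      param.output_grad_ = output;        // receives the reduced (deduplicated) gradient
      param.max_index_ = first_dim;       // assumed: valid index range, old first_dim
      param.value_stride_ = outer_dim;    // assumed: elements per index row, old outer_dim
      param.use_sort_reduce_ = false;     // matches the default in the struct definition
      BucketReduceSparseGradient(param);
    }
    }  // namespace example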

@@ -46,7 +46,7 @@ class EmbeddingLookUpCPUKernel : public CPUKernel {
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs) override;
private:
protected:
void LookUpTable(const std::vector<kernel::AddressPtr> &inputs, size_t dim0, size_t dim1, size_t dim2,
float **output_addr);
void CheckParam(const CNodePtr &kernel_node);

@@ -53,15 +53,15 @@ bool EmbeddingLookUpProxyKernel::Launch(const std::vector<kernel::AddressPtr> &i
size_t output_size = outputs[0]->size;
size_t size = input_size / sizeof(float);
::ps::SArray<float> lookup_ids(size, 0);
::ps::SArray<int> lookup_ids(size, 0);
::ps::SArray<int> lengths{size};
::ps::SArray<float> lookup_result;
::ps::SArray<float> lookup_result(output_size / sizeof(float), 0);
auto ret = memcpy_s(lookup_ids.data(), input_size, indices_addr, input_size);
if (ret != EOK) {
MS_LOG(EXCEPTION) << "Lookup id memcpy failed.";
}
parallel::ps::Worker<float>::GetInstance().DoPSEmbeddingLookup({key_}, lookup_ids, lengths, lookup_result,
parallel::ps::Worker<float>::GetInstance().DoPSEmbeddingLookup({key_}, lookup_ids, lengths, &lookup_result,
parallel::ps::kEmbeddingLookupCmd);
auto ret2 = memcpy_s(output_addr, output_size, lookup_result.data(), output_size);
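
The proxy kernel now builds the lookup ids as int32 and pre-sizes lookup_result from the output byte count, so DoPSEmbeddingLookup can fill caller-owned storage through the new pointer parameter. A rough sketch of the sizing pattern, with std::vector standing in for ::ps::SArray (an assumption made to keep the example self-contained):

    // Sketch of the buffer-sizing pattern only; the real kernel uses ::ps::SArray
    // and DoPSEmbeddingLookup, which are not reproduced here.
    #include <cstring>
    #include <vector>

    std::vector<float> EmbeddingLookupBuffers(const int *indices_addr, size_t input_size,
                                              size_t output_size) {
      // One int32 lookup id per sizeof(int) input bytes, copied out of the raw input buffer.
      std::vector<int> lookup_ids(input_size / sizeof(int), 0);
      std::memcpy(lookup_ids.data(), indices_addr, input_size);

      // Pre-size the result from the output byte count so the lookup can write
      // directly into caller-owned storage (passed by pointer in the real call).
      std::vector<float> lookup_result(output_size / sizeof(float), 0.0f);
      return lookup_result;
    }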

@@ -50,7 +50,7 @@ void EmbeddingLookUpPSKernel::InitKernel(
split_num_ = pserver_num_;
// input shape should be sharded after computing offset_;
Shard(input_shape_, axis_);
Shard(&input_shape_, axis_);
size_t output_size =
std::accumulate(output_shape_.begin(), output_shape_.end(), sizeof(float), std::multiplies<size_t>());

@@ -34,5 +34,13 @@ MS_REG_CPU_KERNEL_T(Push,
MS_REG_CPU_KERNEL_T(
Push, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeUInt64),
PushKernel, float);
MS_REG_CPU_KERNEL_T(Push,
KernelAttr()
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeUInt64),
PushKernel, float);
} // namespace kernel
} // namespace mindspore

@@ -43,7 +43,7 @@ class PushKernel : public CPUKernel {
sizes.push_back(SizeToInt(input->size) / sizeof(T));
}
parallel::ps::Worker<T>::GetInstance().Push(keys, addrs, sizes);
memcpy(outputs[0]->addr, &key_, sizeof(size_t));
memcpy_s(outputs[0]->addr, sizeof(size_t), &key_, sizeof(size_t));
return true;
}
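
Replacing memcpy with memcpy_s makes the destination capacity explicit and surfaces a checkable error code instead of risking a silent overrun. A minimal sketch of the pattern, assuming the securec-style memcpy_s(dest, destMax, src, count) and EOK already used elsewhere in this diff:

    // Minimal sketch; memcpy_s and EOK are assumed to come from the securec
    // library used by the surrounding kernels.
    #include <cstddef>
    #include "securec.h"

    bool WriteKeyToOutput(void *output_addr, size_t output_capacity, size_t key) {
      // memcpy_s rejects the copy when sizeof(key) exceeds output_capacity
      // instead of writing past the end of the destination buffer.
      auto ret = memcpy_s(output_addr, output_capacity, &key, sizeof(key));
      return ret == EOK;  // EOK (0) means the bounded copy succeeded
    }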

@@ -75,7 +75,7 @@ void SparseApplyAdamPSKernel::ReInit(const std::shared_ptr<std::vector<std::shar
void SparseApplyAdamPSKernel::ReInit(const std::vector<AddressPtr> &inputs) {
const auto &indices_addr = inputs[10];
indices_size_ = indices_addr->size;
indices_size_ = indices_addr->size / sizeof(int);
workspace_size_list_[0] = indices_size_ * var_outer_dim_size_ * sizeof(float);
workspace_size_list_[1] = indices_size_ * sizeof(int);
}

@@ -64,7 +64,7 @@ void SparseApplyFtrlPSKernel::ReInit(const std::shared_ptr<std::vector<std::shar
void SparseApplyFtrlPSKernel::ReInit(const std::vector<AddressPtr> &inputs) {
const auto &indices_addr = inputs[4];
indices_size_ = indices_addr->size;
indices_size_ = indices_addr->size / sizeof(int);
workspace_size_list_[0] = indices_size_ * var_outer_dim_size_ * sizeof(float);
workspace_size_list_[1] = indices_size_ * sizeof(int);
}
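
In both PS optimizer kernels, indices_addr->size is a byte count, so storing it directly in indices_size_ overstated the number of indices by a factor of sizeof(int) and inflated the workspace sizes derived from it. A small sketch of the corrected arithmetic (helper names are illustrative):

    // Illustrative helpers only; the kernels compute this inline.
    #include <cstddef>

    size_t IndicesElementCount(size_t indices_bytes) {
      return indices_bytes / sizeof(int);  // e.g. 400 bytes -> 100 int32 indices
    }

    size_t UniqueGradWorkspaceBytes(size_t indices_bytes, size_t var_outer_dim_size) {
      const size_t indices_size = IndicesElementCount(indices_bytes);
      // Mirrors workspace_size_list_[0] = indices_size_ * var_outer_dim_size_ * sizeof(float)
      return indices_size * var_outer_dim_size * sizeof(float);
    }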

Some files were not shown because too many files have changed in this diff.
