synchronize latest Ascend software suite 18 Jul 2020, and merging branches

pull/3198/head
yanghaoran 5 years ago
commit 859acc6d2a

.gitmodules

@@ -15,4 +15,4 @@
url = https://gitee.com/mindspore/akg.git
[submodule "graphengine"]
path = graphengine
url = https://gitee.com/ms-incubator/graphengine.git
url = https://gitee.com/mindspore/graphengine.git

@@ -202,10 +202,10 @@ Check out how MindSpore Open Governance [works](https://gitee.com/mindspore/comm
### Communication
- [MindSpore Slack](https://join.slack.com/t/mindspore/shared_invite/enQtOTcwMTIxMDI3NjM0LTNkMWM2MzI5NjIyZWU5ZWQ5M2EwMTQ5MWNiYzMxOGM4OWFhZjI4M2E5OGI2YTg3ODU1ODE2Njg1MThiNWI3YmQ) - Communication platform for developers.
- [MindSpore Slack](https://join.slack.com/t/mindspore/shared_invite/zt-dgk65rli-3ex4xvS4wHX7UDmsQmfu8w) - Communication platform for developers.
- IRC channel at `#mindspore` (only for meeting minutes logging purpose)
- Video Conferencing: https://meet.jit.si
- Mailing-list: https://mailweb.mindspore.cn/postorius/lists
- Video Conferencing: TBD
- Mailing-list: <https://mailweb.mindspore.cn/postorius/lists>
## Contributing

akg

@@ -1 +1 @@
Subproject commit df57a6cf9450e347d1854687d1fe66a420ee3b35
Subproject commit f60af9df4220bf3db5de2b224418953c0dc1f625

@@ -24,7 +24,7 @@ usage()
{
echo "Usage:"
echo "bash build.sh [-d] [-r] [-v] [-c on|off] [-t on|off] [-g on|off] [-h] [-b ge] [-m infer|train] \\"
echo " [-a on|off] [-Q on|off] [-S on|off] [-p on|off] [-i] [-L] [-R] [-D on|off] [-j[n]] [-e gpu|d|cpu] \\"
echo " [-a on|off] [-Q on|off] [-p on|off] [-i] [-L] [-R] [-D on|off] [-j[n]] [-e gpu|d|cpu] \\"
echo " [-P on|off] [-z [on|off]] [-M on|off] [-V 9.2|10.1] [-I] [-K] [-B on|off] [-E] [-l on|off]"
echo ""
echo "Options:"
@@ -48,7 +48,6 @@ usage()
echo " -P Enable dump anf graph to file in ProtoBuffer format, default on"
echo " -Q Enable dump memory, default off"
echo " -D Enable dumping of function graph ir, default on"
echo " -S Enable async data dump, default off"
echo " -z Compile dataset & mindrecord, default on"
echo " -M Enable MPI and NCCL for GPU training, gpu default on"
echo " -V Specify the minimum required cuda version, default CUDA 10.1"
@@ -89,7 +88,6 @@ checkopts()
ENABLE_TIMELINE="off"
ENABLE_DUMP2PROTO="on"
ENABLE_DUMPE2E="off"
ENABLE_DATA_DUMP="off"
ENABLE_DUMP_IR="on"
COMPILE_MINDDATA="on"
ENABLE_MPI="off"
@@ -104,7 +102,7 @@ checkopts()
ENABLE_PYTHON="on"
# Process the options
while getopts 'drvj:c:t:hsb:a:g:p:ie:m:l:I:LRP:Q:S:D:zM:V:K:sB:E' opt
while getopts 'drvj:c:t:hsb:a:g:p:ie:m:l:I:LRP:Q:D:zM:V:K:sB:E' opt
do
OPTARG=$(echo ${OPTARG} | tr '[A-Z]' '[a-z]')
case "${opt}" in
@@ -186,6 +184,7 @@ checkopts()
elif [[ "X$OPTARG" == "Xd" || "X$OPTARG" == "Xascend" ]]; then
ENABLE_D="on"
ENABLE_CPU="on"
ENABLE_SERVING="on"
elif [[ "X$OPTARG" == "Xcpu" ]]; then
ENABLE_CPU="on"
else
@@ -220,11 +219,6 @@ checkopts()
ENABLE_DUMPE2E="$OPTARG"
echo "enable dump end to end"
;;
S)
check_on_off $OPTARG S
ENABLE_DATA_DUMP="$OPTARG"
echo "enable data dump"
;;
D)
check_on_off $OPTARG D
ENABLE_DUMP_IR="$OPTARG"
@@ -328,9 +322,6 @@ build_mindspore()
if [[ "X$ENABLE_DUMPE2E" = "Xon" ]]; then
CMAKE_ARGS="${CMAKE_ARGS} -DENABLE_DUMP_E2E=ON"
fi
if [[ "X$ENABLE_DATA_DUMP" = "Xon" ]]; then
CMAKE_ARGS="${CMAKE_ARGS} -DENABLE_DATA_DUMP=ON"
fi
CMAKE_ARGS="${CMAKE_ARGS} -DENABLE_DUMP_IR=${ENABLE_DUMP_IR}"
CMAKE_ARGS="${CMAKE_ARGS} -DENABLE_PYTHON=${ENABLE_PYTHON}"
if [[ "X$ENABLE_MPI" = "Xon" ]]; then

@@ -1,4 +1,4 @@
set(glog_CXXFLAGS "-D_FORTIFY_SOURCE=2 -O2 ${SECURE_CXX_FLAGS}")
set(glog_CXXFLAGS "-D_FORTIFY_SOURCE=2 -O2 ${SECURE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0")
set(glog_CFLAGS "-D_FORTIFY_SOURCE=2 -O2")
mindspore_add_pkg(glog
VER 0.4.0

@@ -116,10 +116,10 @@ if(ENABLE_DUMP_E2E)
add_compile_definitions(ENABLE_DUMP_E2E)
endif()
if(ENABLE_DATA_DUMP)
add_compile_definitions(ENABLE_DATA_DUMP)
endif()
if(ENABLE_DEBUGGER)
add_compile_definitions(ENABLE_DEBUGGER)
endif()
if(ENABLE_TESTCASES)
add_compile_definitions(ENABLE_TESTCASES)
endif()

@@ -1,13 +1,16 @@
# find exec
find_package(Python3 3.7 COMPONENTS Interpreter Development)
if (NOT Python3_FOUND)
message("No python3 found.")
return ()
message(FATAL_ERROR "No python3 found.")
endif ()
set(PYTHON ${Python3_EXECUTABLE})
set(PYTHON_VERSION ${Python3_VERSION_MAJOR}.${Python3_VERSION_MINOR})
if (NOT PYTHON_VERSION MATCHES "3.7")
message(FATAL_ERROR "FIND PYTHON VERSION ${PYTHON_VERSION} BUT CAN NOT MATCH PYTHON VERSION 3.7")
endif ()
find_package(Git)
if (NOT GIT_FOUND)
message("No git found.")

@@ -1 +1 @@
Subproject commit eee707935c066c16e9b9cd207f8125871b6b97cf
Subproject commit 103f2d1019dc50d781d7a964551d9f1f50b3b009

@@ -17,7 +17,7 @@
"""Resources for ast tree parse."""
import ast
import math
from mindspore import IndexedSlices
from mindspore import IndexedSlices, SparseTensor
from mindspore.ops.composite import multitype_ops
from mindspore.ops import functional as F, composite as C
from . import standard_method as M
@@ -140,4 +140,5 @@ convert_object_map = {
# user defined
IndexedSlices: F.make_indexed_slices,
SparseTensor: F.make_sparse_tensor,
}

@@ -44,7 +44,7 @@ if(ENABLE_GPU)
"backend/kernel_compiler/akg/akg_kernel_attrs_process.cc"
)
list(APPEND CUDA_NVCC_FLAGS -arch=sm_53)
list(APPEND CUDA_NVCC_FLAGS -arch=sm_53 --expt-relaxed-constexpr)
list(REMOVE_ITEM GPU_SRC_LIST "runtime/device/gpu/blocking_queue.cc" "runtime/device/gpu/gpu_buffer_mgr.cc")
list(REMOVE_ITEM GPU_SRC_LIST "runtime/device/gpu/mpi/mpi_initializer.cc"
"runtime/device/gpu/distribution/collective_wrapper.cc"

@@ -26,14 +26,6 @@ if (ENABLE_CPU)
"cpu/*.cc"
)
list(REMOVE_ITEM CPU_SRC_LIST "cpu/ps/push_kernel.cc"
"cpu/ps/pull_kernel.cc"
"cpu/ps/embedding_look_up_ps_kernel.cc"
"cpu/ps/embedding_look_up_proxy_kernel.cc"
"cpu/ps/apply_momentum_ps_kernel.cc"
"cpu/ps/sparse_apply_adam_ps_kernel.cc"
"cpu/ps/sparse_apply_ftrl_ps_kernel.cc")
if (NOT ENABLE_MPI)
list(REMOVE_ITEM CPU_SRC_LIST "cpu/allgather_cpu_kernel.cc")
list(REMOVE_ITEM CPU_SRC_LIST "cpu/reduce_scatter_cpu_kernel.cc")
@@ -41,6 +33,17 @@ if (ENABLE_CPU)
endif ()
endif ()
if (${CMAKE_SYSTEM_NAME} MATCHES "Windows" OR ENABLE_GE)
list(REMOVE_ITEM CPU_SRC_LIST "cpu/ps/apply_momentum_ps_kernel.cc")
list(REMOVE_ITEM CPU_SRC_LIST "cpu/ps/embedding_look_up_proxy_kernel.cc")
list(REMOVE_ITEM CPU_SRC_LIST "cpu/ps/embedding_look_up_ps_kernel.cc")
list(REMOVE_ITEM CPU_SRC_LIST "cpu/ps/pserver_kernel.cc")
list(REMOVE_ITEM CPU_SRC_LIST "cpu/ps/pull_kernel.cc")
list(REMOVE_ITEM CPU_SRC_LIST "cpu/ps/push_kernel.cc")
list(REMOVE_ITEM CPU_SRC_LIST "cpu/ps/sparse_apply_adam_ps_kernel.cc")
list(REMOVE_ITEM CPU_SRC_LIST "cpu/ps/sparse_apply_ftrl_ps_kernel.cc")
endif()
if (ENABLE_GPU)
file(GLOB_RECURSE CUDA_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
"gpu/*.cu"

@@ -18,6 +18,7 @@
#include <algorithm>
#include "backend/session/anf_runtime_algorithm.h"
#include "backend/optimizer/common/helper.h"
#include "backend/kernel_compiler/common_utils.h"
namespace mindspore {
namespace kernel {
@@ -75,15 +76,7 @@ void SetAkgAttrsForCast(const AnfNodePtr &anf_node) {
std::string dst_type;
TypeId output_type = AnfAlgo::GetOutputDeviceDataType(anf_node, 0);
if (output_type == kFloat32->type_id()) {
dst_type = "float32";
} else if (output_type == kFloat16->type_id()) {
dst_type = "float16";
} else if (output_type == kInt32->type_id()) {
dst_type = "int32";
} else {
MS_LOG(WARNING) << "Unknown cast_to type: " << TypeIdToType(output_type)->ToString();
}
dst_type = TypeId2String(output_type);
AnfAlgo::SetNodeAttr("dst_type", MakeValue(dst_type), anf_node);
}

@@ -21,9 +21,7 @@
#include <memory>
#include "framework/ge_runtime/task_info.h"
#include "backend/kernel_compiler/kernel.h"
#ifdef ENABLE_DATA_DUMP
#include "debug/data_dump_parser.h"
#endif
using TaskInfoPtr = std::shared_ptr<ge::model_runner::TaskInfo>;
namespace mindspore {
@@ -34,13 +32,7 @@ class AscendKernelMod : public KernelMod {
const std::vector<AddressPtr> &, uint32_t) = 0;
uint32_t block_dim() { return block_dim_; }
uint32_t stream_id() { return stream_id_; }
virtual bool NeedDump() {
#ifdef ENABLE_DATA_DUMP
return DataDumpParser::GetInstance().NeedDump(kernel_name_);
#else
return false;
#endif
}
virtual bool NeedDump() { return DataDumpParser::GetInstance().NeedDump(kernel_name_); }
protected:
uint32_t block_dim_{1};

File diff suppressed because it is too large.

@@ -73,9 +73,18 @@ class KernelMeta {
};
struct SparseGradient {
float *value_;
int *indices_;
size_t indices_size_;
float *value_{nullptr};
int *indices_{nullptr};
size_t indices_size_{0};
};
struct ReduceSparseGradientParam {
SparseGradient *input_grad_{nullptr};
SparseGradient *workspace_grad_{nullptr};
SparseGradient *output_grad_{nullptr};
size_t max_index_{0};
size_t value_stride_{0};
bool use_sort_reduce_{false};
};
struct MultiThreadComputeParams {
@@ -112,10 +121,6 @@ void SaveJsonInfo(const std::string &json_name, const std::string &info);
std::string GetProcessor(const AnfNodePtr &anf_node);
bool IsSameShape(const std::vector<size_t> &shape_a, const std::vector<size_t> &shape_b);
int Sign(float x);
void DeduplicateIndexedSlices(const SparseGradient &origin_sparse_grad, SparseGradient *unique_grad, size_t first_dim,
size_t outer_dim);
void ReduceSparseGradient(const SparseGradient &origin_sparse_grad, SparseGradient *unique_grad, size_t first_dim,
size_t outer_dim, bool use_multi_threads = true);
std::pair<AnfNodePtr, size_t> GetKernelInput(const AnfNodePtr &anf_node, size_t index);
std::vector<std::pair<AnfNodePtr, std::pair<size_t, size_t>>> GetInputIndex(const std::vector<AnfNodePtr> &node_list,
const std::vector<AnfNodePtr> &input_list);
@@ -130,14 +135,7 @@ void GetGraphRealOutput(const FuncGraphPtr &func_graph, std::vector<std::pair<An
bool IsWeightBoundary(const AnfNodePtr &node);
void MultiThreadCompute(const MultiThreadComputeFunc &func, MultiThreadComputeParams *params,
size_t total_compute_size);
void RunMultiThreadReduceSparseGradient(const SparseGradient &origin_sparse_grad, SparseGradient *unique_grad,
size_t outer_dim, std::vector<std::pair<int, size_t>> *sorted_indices,
std::vector<size_t> *slice_positions);
void ReduceMultiSparseGradient(const std::vector<std::shared_ptr<SparseGradient>> &unique_slice_grads,
SparseGradient *tmp_grad, SparseGradient *unique_grad, size_t first_dim,
size_t outer_dim);
void TwoLevelReduceSparseGradient(const SparseGradient &origin_sparse_grad, SparseGradient *tmp_grad,
SparseGradient *unique_grad, size_t first_dim, size_t outer_dim);
void BucketReduceSparseGradient(const ReduceSparseGradientParam &param);
std::vector<int> GetReduceAttrAxis(const CNodePtr &cnode);
} // namespace kernel
} // namespace mindspore
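
The removed per-stage helpers (DeduplicateIndexedSlices, ReduceSparseGradient, RunMultiThreadReduceSparseGradient, ReduceMultiSparseGradient, TwoLevelReduceSparseGradient) are consolidated into the single BucketReduceSparseGradient entry point configured through ReduceSparseGradientParam. A hypothetical call-site sketch follows; the mapping of max_index_ to the old first_dim and value_stride_ to outer_dim is an inference from the removed signatures, and the buffer wiring is illustrative only:

    // Illustrative only: shows how the new parameter struct might be filled.
    // The struct fields and the BucketReduceSparseGradient declaration come from
    // backend/kernel_compiler/common_utils.h as changed in this commit.
    #include <cstddef>
    #include "backend/kernel_compiler/common_utils.h"

    namespace example {
    using mindspore::kernel::BucketReduceSparseGradient;
    using mindspore::kernel::ReduceSparseGradientParam;
    using mindspore::kernel::SparseGradient;

    void ReduceDuplicatedIndices(SparseGradient *input, SparseGradient *workspace,
                                 SparseGradient *output, size_t first_dim, size_t outer_dim) {
      ReduceSparseGradientParam param;
      param.input_grad_ = input;          // raw gradient, may contain duplicate indices
      param.workspace_grad_ = workspace;  // scratch buffers sized like the input
      param.output_grad_ = output;        // receives the reduced (deduplicated) gradient
      param.max_index_ = first_dim;       // assumed: valid index range, old first_dim
      param.value_stride_ = outer_dim;    // assumed: elements per index row, old outer_dim
      param.use_sort_reduce_ = false;     // matches the default in the struct definition
      BucketReduceSparseGradient(param);
    }
    }  // namespace example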

@@ -46,7 +46,7 @@ class EmbeddingLookUpCPUKernel : public CPUKernel {
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs) override;
private:
protected:
void LookUpTable(const std::vector<kernel::AddressPtr> &inputs, size_t dim0, size_t dim1, size_t dim2,
float **output_addr);
void CheckParam(const CNodePtr &kernel_node);

@@ -53,15 +53,15 @@ bool EmbeddingLookUpProxyKernel::Launch(const std::vector<kernel::AddressPtr> &i
size_t output_size = outputs[0]->size;
size_t size = input_size / sizeof(float);
::ps::SArray<float> lookup_ids(size, 0);
::ps::SArray<int> lookup_ids(size, 0);
::ps::SArray<int> lengths{size};
::ps::SArray<float> lookup_result;
::ps::SArray<float> lookup_result(output_size / sizeof(float), 0);
auto ret = memcpy_s(lookup_ids.data(), input_size, indices_addr, input_size);
if (ret != EOK) {
MS_LOG(EXCEPTION) << "Lookup id memcpy failed.";
}
parallel::ps::Worker<float>::GetInstance().DoPSEmbeddingLookup({key_}, lookup_ids, lengths, lookup_result,
parallel::ps::Worker<float>::GetInstance().DoPSEmbeddingLookup({key_}, lookup_ids, lengths, &lookup_result,
parallel::ps::kEmbeddingLookupCmd);
auto ret2 = memcpy_s(output_addr, output_size, lookup_result.data(), output_size);
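
The proxy kernel now builds the lookup ids as int32 and pre-sizes lookup_result from the output byte count, so DoPSEmbeddingLookup can fill caller-owned storage through the new pointer parameter. A rough sketch of the sizing pattern, with std::vector standing in for ::ps::SArray (an assumption made to keep the example self-contained):

    // Sketch of the buffer-sizing pattern only; the real kernel uses ::ps::SArray
    // and DoPSEmbeddingLookup, which are not reproduced here.
    #include <cstring>
    #include <vector>

    std::vector<float> EmbeddingLookupBuffers(const int *indices_addr, size_t input_size,
                                              size_t output_size) {
      // One int32 lookup id per sizeof(int) input bytes, copied out of the raw input buffer.
      std::vector<int> lookup_ids(input_size / sizeof(int), 0);
      std::memcpy(lookup_ids.data(), indices_addr, input_size);

      // Pre-size the result from the output byte count so the lookup can write
      // directly into caller-owned storage (passed by pointer in the real call).
      std::vector<float> lookup_result(output_size / sizeof(float), 0.0f);
      return lookup_result;
    }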

@@ -50,7 +50,7 @@ void EmbeddingLookUpPSKernel::InitKernel(
split_num_ = pserver_num_;
// input shape should be sharded after computing offset_;
Shard(input_shape_, axis_);
Shard(&input_shape_, axis_);
size_t output_size =
std::accumulate(output_shape_.begin(), output_shape_.end(), sizeof(float), std::multiplies<size_t>());

@@ -34,5 +34,13 @@ MS_REG_CPU_KERNEL_T(Push,
MS_REG_CPU_KERNEL_T(
Push, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeUInt64),
PushKernel, float);
MS_REG_CPU_KERNEL_T(Push,
KernelAttr()
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeUInt64),
PushKernel, float);
} // namespace kernel
} // namespace mindspore

@@ -43,7 +43,7 @@ class PushKernel : public CPUKernel {
sizes.push_back(SizeToInt(input->size) / sizeof(T));
}
parallel::ps::Worker<T>::GetInstance().Push(keys, addrs, sizes);
memcpy(outputs[0]->addr, &key_, sizeof(size_t));
memcpy_s(outputs[0]->addr, sizeof(size_t), &key_, sizeof(size_t));
return true;
}
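
Replacing memcpy with memcpy_s makes the destination capacity explicit and surfaces a checkable error code instead of risking a silent overrun. A minimal sketch of the pattern, assuming the securec-style memcpy_s(dest, destMax, src, count) and EOK already used elsewhere in this diff:

    // Minimal sketch; memcpy_s and EOK are assumed to come from the securec
    // library used by the surrounding kernels.
    #include <cstddef>
    #include "securec.h"

    bool WriteKeyToOutput(void *output_addr, size_t output_capacity, size_t key) {
      // memcpy_s rejects the copy when sizeof(key) exceeds output_capacity
      // instead of writing past the end of the destination buffer.
      auto ret = memcpy_s(output_addr, output_capacity, &key, sizeof(key));
      return ret == EOK;  // EOK (0) means the bounded copy succeeded
    }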

@@ -75,7 +75,7 @@ void SparseApplyAdamPSKernel::ReInit(const std::shared_ptr<std::vector<std::shar
void SparseApplyAdamPSKernel::ReInit(const std::vector<AddressPtr> &inputs) {
const auto &indices_addr = inputs[10];
indices_size_ = indices_addr->size;
indices_size_ = indices_addr->size / sizeof(int);
workspace_size_list_[0] = indices_size_ * var_outer_dim_size_ * sizeof(float);
workspace_size_list_[1] = indices_size_ * sizeof(int);
}

@@ -64,7 +64,7 @@ void SparseApplyFtrlPSKernel::ReInit(const std::shared_ptr<std::vector<std::shar
void SparseApplyFtrlPSKernel::ReInit(const std::vector<AddressPtr> &inputs) {
const auto &indices_addr = inputs[4];
indices_size_ = indices_addr->size;
indices_size_ = indices_addr->size / sizeof(int);
workspace_size_list_[0] = indices_size_ * var_outer_dim_size_ * sizeof(float);
workspace_size_list_[1] = indices_size_ * sizeof(int);
}
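
In both PS optimizer kernels, indices_addr->size is a byte count, so storing it directly in indices_size_ overstated the number of indices by a factor of sizeof(int) and inflated the workspace sizes derived from it. A small sketch of the corrected arithmetic (helper names are illustrative):

    // Illustrative helpers only; the kernels compute this inline.
    #include <cstddef>

    size_t IndicesElementCount(size_t indices_bytes) {
      return indices_bytes / sizeof(int);  // e.g. 400 bytes -> 100 int32 indices
    }

    size_t UniqueGradWorkspaceBytes(size_t indices_bytes, size_t var_outer_dim_size) {
      const size_t indices_size = IndicesElementCount(indices_bytes);
      // Mirrors workspace_size_list_[0] = indices_size_ * var_outer_dim_size_ * sizeof(float)
      return indices_size * var_outer_dim_size * sizeof(float);
    }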

Some files were not shown because too many files have changed in this diff.
