rewrite ddim

test=develop
7 years ago · a500dfa579
parent e213050223
commit a500dfa579
30 changed files with 622 additions and 615 deletions
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@ -36,7 +36,7 @@ add_subdirectory(details)
 proto_library(framework_proto SRCS framework.proto)
 proto_library(async_executor_proto SRCS data_feed.proto)

-cc_library(ddim SRCS ddim.cc DEPS eigen3 boost)
+cc_library(ddim SRCS ddim.cc DEPS eigen3 boost enforce)
 cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
 nv_test(dim_test SRCS dim_test.cu DEPS ddim)
 cc_library(data_type SRCS data_type.cc DEPS framework_proto ddim device_context)
--- a/paddle/fluid/framework/array.h
+++ b/paddle/fluid/framework/array.h
@ -15,34 +15,88 @@
 #pragma once

 #include <cstdint>
-#include "paddle/fluid/platform/hostdevice.h"
+#include "paddle/fluid/framework/unroll_array_ops.h"
+#include "paddle/fluid/platform/enforce.h"

 namespace paddle {
 namespace framework {
+
 template <typename T, size_t N>
 class Array {
-  static_assert(N > 0, "The size of array must be larger than 0");
-
 public:
-  HOSTDEVICE Array() {}
+  static constexpr size_t kSize = N;

-  HOSTDEVICE explicit Array(const T &val) {
-    for (size_t i = 0; i < N; ++i) data_[i] = val;
+  HOSTDEVICE inline Array() = default;
+
+  template <typename... Args>
+  HOSTDEVICE inline explicit Array(const T &val, Args... args) {
+    UnrollVarArgsAssign<T, N>::Run(data_, val, args...);
  }

-  HOSTDEVICE const T *Get() const { return data_; }
+  HOSTDEVICE inline void Fill(const T &val) {
+    UnrollFillConstant<N>::Run(data_, val);
+  }

-  HOSTDEVICE T *GetMutable() { return data_; }
+  HOSTDEVICE inline const T *Get() const { return data_; }

-  HOSTDEVICE T &operator[](size_t index) { return data_[index]; }
+  HOSTDEVICE inline T *GetMutable() { return data_; }

-  HOSTDEVICE const T &operator[](size_t index) const { return data_[index]; }
+  HOSTDEVICE inline T &operator[](size_t index) { return data_[index]; }
+
+  HOSTDEVICE inline const T &operator[](size_t index) const {
+    return data_[index];
+  }

  HOSTDEVICE constexpr size_t size() const { return N; }

+  HOSTDEVICE inline bool operator==(const Array<T, N> &other) const {
+    return UnrollCompare<N>::Run(data_, other.data_);
+  }
+
+  HOSTDEVICE inline bool operator!=(const Array<T, N> &other) const {
+    return !(*this == other);
+  }
+
 private:
  T data_[N];
 };

+template <typename T>  // Specialization of Array for N == 0: it declares no data member at all.
+class Array<T, 0> {
+ public:
+  static constexpr size_t kSize = 0;  // Compile-time element count; always 0 for this specialization.
+
+  HOSTDEVICE inline Array() = default;
+
+  HOSTDEVICE inline void Fill(const T &val) {}  // There is nothing to fill, so this is a no-op.
+
+  HOSTDEVICE inline constexpr T *Get() const { return nullptr; }  // No backing buffer exists; always returns nullptr.
+
+  // Adding constexpr to GetMutable() causes a warning on Mac.
+  HOSTDEVICE inline T *GetMutable() { return nullptr; }
+
+  HOSTDEVICE inline T &operator[](size_t index) {  // NOTE(review): with __CUDA_ARCH__ defined the body is empty, so this T&-returning function falls off its end; confirm it is never called there.
+#ifndef __CUDA_ARCH__
+    PADDLE_THROW("Array<T, 0> has no element");  // Every index is invalid for an empty array.
+#endif
+  }
+
+  HOSTDEVICE inline const T &operator[](size_t index) const {  // NOTE(review): same missing-return concern as the non-const overload when __CUDA_ARCH__ is defined.
+#ifndef __CUDA_ARCH__
+    PADDLE_THROW("Array<T, 0> has no element");  // Every index is invalid for an empty array.
+#endif
+  }
+
+  HOSTDEVICE constexpr size_t size() const { return 0; }
+
+  HOSTDEVICE constexpr bool operator==(const Array<T, 0> &other) const {  // Two empty arrays always compare equal.
+    return true;
+  }
+
+  HOSTDEVICE constexpr bool operator!=(const Array<T, 0> &other) const {  // Consequently they are never unequal.
+    return false;
+  }
+};
+
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/ddim.cc
+++ b/paddle/fluid/framework/ddim.cc
--- a/paddle/fluid/framework/ddim.h
+++ b/paddle/fluid/framework/ddim.h
@ -18,8 +18,6 @@ limitations under the License. */
 #include <stdexcept>
 #include <vector>
 #include "paddle/fluid/framework/dim.h"
-#include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/variant.h"

 namespace paddle {
 namespace framework {
@ -29,51 +27,138 @@ namespace framework {
 *
 * The number of dimensions must be between [1, 9].
 */
-struct DDim {
-  typedef boost::variant<Dim<0>, Dim<1>, Dim<2>, Dim<3>, Dim<4>, Dim<5>, Dim<6>,
-                         Dim<7>, Dim<8>, Dim<9>>
-      DDimVar;
-  DDimVar var;
+class DDim {
+ public:
+  constexpr static int kMaxRank = 9;

-  DDim() : var(Dim<1>()) {}
+  DDim() : rank_(1) { dim_[0] = 0; }
+
+  DDim(const int* d, int n);
+  DDim(const int64_t* d, int n);

  template <int D>
-  explicit DDim(const Dim<D>& in) : var(in) {}
+  /*implicit*/ DDim(const Dim<D>& in) : rank_(D) {  // NOLINT
+    UnsafeCast<D>() = in;
+  }

-  /*implicit*/ DDim(std::initializer_list<int64_t> init_list);
+  /*implicit*/ DDim(std::initializer_list<int64_t> init_list)
+      : DDim(init_list.begin(), init_list.size()) {}

  template <int D>
-  DDim& operator=(const Dim<D>& in) {
-    var = in;
+  inline DDim& operator=(const Dim<D>& in) {
+    rank_ = D;
+    UnsafeCast<D>() = in;
    return *this;
  }

-  int64_t& operator[](int idx);
-  int64_t operator[](int idx) const;
+  inline int64_t& operator[](int idx) { return dim_[idx]; }

-  template <typename Visitor>
-  typename Visitor::result_type apply_visitor(Visitor& visitor) {
-    return var.apply_visitor(visitor);
+  inline int64_t operator[](int idx) const { return dim_[idx]; }
+
+  inline int64_t& at(int idx) {
+    PADDLE_ENFORCE(idx >= 0 && idx < rank_);
+    return dim_[idx];
  }

-  template <typename Visitor>
-  typename Visitor::result_type apply_visitor(Visitor& visitor) const {
-    return var.apply_visitor(visitor);
+  inline int64_t at(int idx) const {
+    PADDLE_ENFORCE(idx >= 0 && idx < rank_);
+    return dim_[idx];
  }

-  DDimVar getVar() { return var; }
+  template <typename Visitor>
+  typename std::result_of<Visitor(Dim<0>&)>::type apply_visitor(
+      Visitor&& visitor);
+
+  template <typename Visitor>
+  typename std::result_of<Visitor(const Dim<0>&)>::type apply_visitor(
+      Visitor&& visitor) const;
+
+  bool operator==(const DDim& d) const;
+
+  bool operator!=(const DDim& d) const;
+
+  DDim operator+(const DDim& d) const;

-  bool operator==(DDim d) const;
+  DDim operator*(const DDim& d) const;

-  bool operator!=(DDim d) const;
+  // Make DDim act like std::vector<int64_t>
+  using iterator = int64_t*;
+  using const_iterator = const int64_t*;

-  DDim operator+(DDim d) const;
+  int64_t* data() { return dim_.data(); }
+  const int64_t* data() const { return dim_.data(); }

-  DDim operator*(DDim d) const;
+  iterator begin() { return data(); }
+  const_iterator begin() const { return data(); }
+  iterator end() { return data() + rank_; }
+  const_iterator end() const { return data() + rank_; }
+
+  int size() const { return rank_; }
+
+ private:
+  template <int M>
+  inline Dim<M>& UnsafeCast() {
+    return const_cast<Dim<M>&>(const_cast<const DDim*>(this)->UnsafeCast<M>());
+  }

-  int size() const;
+  template <int M>
+  inline const Dim<M>& UnsafeCast() const {
+    static_assert(M >= 0 && M <= kMaxRank, "Invalid rank");
+    auto* p = static_cast<const void*>(&dim_);
+    return *reinterpret_cast<const Dim<M>*>(p);
+  }
+
+  friend DDim slice_ddim(const DDim& dim, int begin, int end);
+  friend DDim stride(const DDim& ddim);
+  friend DDim stride_numel(const DDim& ddim);
+
+  Dim<kMaxRank> dim_;
+  int rank_;
 };

+#define PADDLE_VISIT_DDIM(rank) \
+  case rank:                    \
+    return visitor(UnsafeCast<rank>())  // Expands to one switch case that forwards the DDim, viewed as Dim<rank>, to `visitor`.
+
+template <typename Visitor>  // Non-const visitation: hands the visitor a mutable Dim<rank>& view of this DDim.
+typename std::result_of<Visitor(Dim<0>&)>::type DDim::apply_visitor(  // Return type is computed from the Dim<0>& call only; NOTE(review): presumably every rank overload of the visitor returns that same type.
+    Visitor&& visitor) {
+  switch (rank_) {  // Dispatch on the runtime rank; each case returns visitor(UnsafeCast<rank>()).
+    PADDLE_VISIT_DDIM(0);
+    PADDLE_VISIT_DDIM(1);
+    PADDLE_VISIT_DDIM(2);
+    PADDLE_VISIT_DDIM(3);
+    PADDLE_VISIT_DDIM(4);
+    PADDLE_VISIT_DDIM(5);
+    PADDLE_VISIT_DDIM(6);
+    PADDLE_VISIT_DDIM(7);
+    PADDLE_VISIT_DDIM(8);
+    PADDLE_VISIT_DDIM(9);
+    default:
+      PADDLE_THROW("Invalid rank %d", rank_);  // Any rank_ outside 0..9 is reported as an error.
+  }
+}
+
+template <typename Visitor>  // Const visitation: hands the visitor a const Dim<rank>& view of this DDim.
+typename std::result_of<Visitor(const Dim<0>&)>::type DDim::apply_visitor(  // Return type is computed from the const Dim<0>& call only; NOTE(review): presumably every rank overload of the visitor returns that same type.
+    Visitor&& visitor) const {
+  switch (rank_) {  // Dispatch on the runtime rank; each case returns visitor(UnsafeCast<rank>()).
+    PADDLE_VISIT_DDIM(0);
+    PADDLE_VISIT_DDIM(1);
+    PADDLE_VISIT_DDIM(2);
+    PADDLE_VISIT_DDIM(3);
+    PADDLE_VISIT_DDIM(4);
+    PADDLE_VISIT_DDIM(5);
+    PADDLE_VISIT_DDIM(6);
+    PADDLE_VISIT_DDIM(7);
+    PADDLE_VISIT_DDIM(8);
+    PADDLE_VISIT_DDIM(9);
+    default:
+      PADDLE_THROW("Invalid rank %d", rank_);  // Any rank_ outside 0..9 is reported as an error.
+  }
+}
+#undef PADDLE_VISIT_DDIM
+
 /**
 * \brief Make a DDim from std::vector<int64_t>
 *
@ -92,7 +177,7 @@ DDim make_ddim(const std::vector<int>& dims);
 DDim make_ddim(std::initializer_list<int64_t> dims);

 int64_t get(const DDim& dim, int idx);
-void set(DDim& dim, int idx, int val);
+void set(DDim& dim, int idx, int val);  // NOLINT

 std::vector<int64_t> vectorize(const DDim& ddim);
 std::vector<int> vectorize2int(const DDim& ddim);
@ -129,12 +214,3 @@ DDim stride(const DDim& ddim);
 DDim stride_numel(const DDim& ddim);
 }  // namespace framework
 }  // namespace paddle
-
-namespace boost {
-
-template <typename T>
-T get(const paddle::framework::DDim& in) {
-  return boost::get<T>(in.var);
-}
-
-}  // namespace boost
--- a/paddle/fluid/framework/dim.h
+++ b/paddle/fluid/framework/dim.h
--- a/paddle/fluid/framework/dlpack_tensor.cc
+++ b/paddle/fluid/framework/dlpack_tensor.cc
@ -62,7 +62,7 @@ static DLDataType GetDLDataTypeFromTypeIndex(const std::type_index &type) {

 struct DLContextVisitor : public boost::static_visitor<::DLContext> {
  inline ::DLContext operator()(const platform::CPUPlace &place) const {
-    DLContext ctx;
+    ::DLContext ctx;
    ctx.device_type = kDLCPU;
    ctx.device_id = 0;
    return ctx;
@ -70,7 +70,7 @@ struct DLContextVisitor : public boost::static_visitor<::DLContext> {

  inline ::DLContext operator()(const platform::CUDAPlace &place) const {
 #ifdef PADDLE_WITH_CUDA
-    DLContext ctx;
+    ::DLContext ctx;
    ctx.device_type = kDLGPU;
    ctx.device_id = place.device;
    return ctx;
@ -81,7 +81,7 @@ struct DLContextVisitor : public boost::static_visitor<::DLContext> {

  inline ::DLContext operator()(const platform::CUDAPinnedPlace &place) const {
 #ifdef PADDLE_WITH_CUDA
-    DLContext ctx;
+    ::DLContext ctx;
    ctx.device_type = kDLCPUPinned;
    ctx.device_id = 0;
    return ctx;
--- a/paddle/fluid/framework/dlpack_tensor.h
+++ b/paddle/fluid/framework/dlpack_tensor.h
@ -38,7 +38,7 @@ class DLPackTensor {

  // The shape in DLTensor is defined as int64_t*
  // Add this member to make TVMTensor init without heap allocation
-  ShapeType shape_[9];
+  ShapeType shape_[DDim::kMaxRank];
 };

 }  // namespace framework
--- a/paddle/fluid/framework/unroll_array_ops.h
+++ b/paddle/fluid/framework/unroll_array_ops.h
@ -0,0 +1,169 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <type_traits>
+#include "paddle/fluid/platform/hostdevice.h"
+
+namespace paddle {
+namespace framework {
+
+namespace detail {
+
+template <size_t kStart, size_t kEnd, bool kStop>  // Recursive step of a compile-time unrolled fill over indices [kStart, kEnd).
+struct UnrollFillConstant {
+  template <typename T>
+  HOSTDEVICE inline static void Run(T *data, T val) {
+    data[kStart] = val;  // Write the current slot.
+    UnrollFillConstant<kStart + 1, kEnd, kStart + 1 == kEnd>::Run(data, val);  // Recurse on the next index; kStop becomes true once kStart + 1 == kEnd.
+  }
+};
+
+template <size_t kStart, size_t kEnd>  // Terminating case (kStop == true): no slot is written.
+struct UnrollFillConstant<kStart, kEnd, true> {
+  template <typename T>
+  HOSTDEVICE inline static void Run(T *data, T val) {}
+};
+
+template <size_t kStart, size_t kEnd, bool kStop>  // Recursive step of a compile-time unrolled element-wise copy from d1 into d2 over [kStart, kEnd).
+struct UnrollAssign {
+  template <typename Tin, typename Tout>
+  HOSTDEVICE inline static void Run(const Tin *d1, Tout *d2) {
+    d2[kStart] = static_cast<Tout>(d1[kStart]);  // Source and destination element types may differ; convert explicitly.
+    UnrollAssign<kStart + 1, kEnd, kStart + 1 == kEnd>::Run(d1, d2);  // Recurse on the next index.
+  }
+};
+
+template <size_t kStart, size_t kEnd>  // Terminating case (kStop == true): nothing is copied.
+struct UnrollAssign<kStart, kEnd, true> {
+  template <typename Tin, typename Tout>
+  HOSTDEVICE inline static void Run(const Tin *d1, Tout *d2) {}
+};
+
+template <typename T, size_t kStart, size_t kEnd, bool kStop>  // Recursive step that stores a variadic list of values into d[kStart], d[kStart + 1], ...
+struct UnrollVarArgsAssign {
+  template <typename... Args>
+  HOSTDEVICE inline static void Run(T *d, T val, Args... args) {
+    static_assert(sizeof...(args) + 1 == kEnd - kStart, "Wrong argument");  // The number of values still to store must equal the number of slots left.
+    d[kStart] = val;  // Consume the first value.
+    UnrollVarArgsAssign<T, kStart + 1, kEnd, kStart + 1 == kEnd>::Run(d,
+                                                                      args...);  // Recurse with the remaining values.
+  }
+};
+
+template <typename T, size_t kStart, size_t kEnd>  // Terminating case (kStop == true): accepts no values and writes nothing.
+struct UnrollVarArgsAssign<T, kStart, kEnd, true> {
+  HOSTDEVICE inline static void Run(T *d) {}
+};
+
+template <size_t kStart, size_t kEnd, bool kStop>  // Recursive step of a compile-time unrolled element-wise equality test over [kStart, kEnd).
+struct UnrollCompare {
+  template <typename T>
+  HOSTDEVICE inline static bool Run(const T *d1, const T *d2) {
+    return d1[kStart] == d2[kStart] &&  // && short-circuits, so later indices are only compared while all earlier ones matched.
+           UnrollCompare<kStart + 1, kEnd, kStart + 1 == kEnd>::Run(d1, d2);
+  }
+};
+
+template <size_t kStart, size_t kEnd>  // Terminating case (kStop == true): an empty range compares equal.
+struct UnrollCompare<kStart, kEnd, true> {
+  template <typename T>
+  HOSTDEVICE inline constexpr static bool Run(const T *d1, const T *d2) {
+    return true;
+  }
+};
+
+template <size_t kStart, size_t kEnd, bool kStop>  // Recursive step of a compile-time unrolled element-wise sum d3 = d1 + d2 over [kStart, kEnd).
+struct UnrollAdd {
+  template <typename T>
+  HOSTDEVICE inline static void Run(const T *d1, const T *d2, T *d3) {
+    d3[kStart] = d1[kStart] + d2[kStart];  // Add the current pair of elements.
+    UnrollAdd<kStart + 1, kEnd, kStart + 1 == kEnd>::Run(d1, d2, d3);  // Recurse on the next index.
+  }
+};
+
+template <size_t kStart, size_t kEnd>  // Terminating case (kStop == true): nothing is written.
+struct UnrollAdd<kStart, kEnd, true> {
+  template <typename T>
+  HOSTDEVICE inline static void Run(const T *d1, const T *d2, T *d3) {}
+};
+
+template <size_t kStart, size_t kEnd, bool kStop>  // Recursive step of a compile-time unrolled element-wise product d3 = d1 * d2 over [kStart, kEnd).
+struct UnrollMul {
+  template <typename T>
+  HOSTDEVICE inline static void Run(const T *d1, const T *d2, T *d3) {
+    d3[kStart] = d1[kStart] * d2[kStart];  // Multiply the current pair of elements.
+    UnrollMul<kStart + 1, kEnd, kStart + 1 == kEnd>::Run(d1, d2, d3);  // Recurse on the next index.
+  }
+};
+
+template <size_t kStart, size_t kEnd>  // Terminating case (kStop == true): nothing is written.
+struct UnrollMul<kStart, kEnd, true> {
+  template <typename T>
+  HOSTDEVICE inline static void Run(const T *d1, const T *d2, T *d3) {}
+};
+
+template <size_t kStart, size_t kEnd, bool kStop>  // Recursive step of two compile-time unrolled reductions over [kStart, kEnd).
+struct UnrollProduct {
+  template <typename T>
+  HOSTDEVICE inline static T Run(const T *d) {  // One-array overload: product of all elements of d.
+    return d[kStart] *
+           UnrollProduct<kStart + 1, kEnd, kStart + 1 == kEnd>::Run(d);
+  }
+
+  template <typename T>
+  HOSTDEVICE inline static T Run(const T *d1, const T *d2) {  // Two-array overload: sum of the pairwise products d1[i] * d2[i].
+    return d1[kStart] * d2[kStart] +
+           UnrollProduct<kStart + 1, kEnd, kStart + 1 == kEnd>::Run(d1, d2);
+  }
+};
+
+template <size_t kStart, size_t kEnd>  // Terminating case (kStop == true): returns the identity of each reduction.
+struct UnrollProduct<kStart, kEnd, true> {
+  template <typename T>
+  HOSTDEVICE inline constexpr static T Run(const T *d) {
+    return 1;  // Multiplicative identity ends the product chain.
+  }
+
+  template <typename T>
+  HOSTDEVICE inline constexpr static T Run(const T *d1, const T *d2) {
+    return 0;  // Additive identity ends the sum-of-products chain.
+  }
+};
+
+}  // namespace detail
+
+template <size_t N>  // Public entry points: each alias starts the detail:: recursion at index 0 and selects the terminating case immediately when N == 0.
+using UnrollFillConstant = detail::UnrollFillConstant<0, N, N == 0>;  // Fill N elements with one value.
+
+template <size_t N>
+using UnrollAssign = detail::UnrollAssign<0, N, N == 0>;  // Copy N elements, converting the element type.
+
+template <typename T, size_t N>
+using UnrollVarArgsAssign = detail::UnrollVarArgsAssign<T, 0, N, N == 0>;  // Store exactly N argument values into an array.
+
+template <size_t N>
+using UnrollCompare = detail::UnrollCompare<0, N, N == 0>;  // Element-wise equality of N elements.
+
+template <size_t N>
+using UnrollAdd = detail::UnrollAdd<0, N, N == 0>;  // Element-wise sum of N elements.
+
+template <size_t N>
+using UnrollMul = detail::UnrollMul<0, N, N == 0>;  // Element-wise product of N elements.
+
+template <size_t N>
+using UnrollProduct = detail::UnrollProduct<0, N, N == 0>;  // Product of N elements, or sum of pairwise products of two N-element arrays.
+
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/operators/controlflow/logical_op.cc
+++ b/paddle/fluid/operators/controlflow/logical_op.cc
@ -86,8 +86,6 @@ class UnaryLogicalOpInferShape : public framework::InferShapeBase {
    OpComment comment;
    PADDLE_ENFORCE(context->HasInput("X"),
                   "Input(X) of %s operator must not be null", comment.type);
-    auto dim_x = context->GetInputDim("X");
-
    context->SetOutputDim("Out", context->GetInputDim("X"));
    context->ShareLoD("X", "Out");
  }
--- a/paddle/fluid/operators/crop_op.h
+++ b/paddle/fluid/operators/crop_op.h
@ -68,7 +68,6 @@ void CropFunction(const framework::ExecutionContext& context) {
  }
  out->mutable_data<T>(out_dims, context.GetPlace());
  auto x_stride = framework::stride(x->dims());
-  auto out_stride = framework::stride(out->dims());
  auto offsets = GetOffsets(context);
  int64_t offset = 0;
  for (size_t i = 0; i < offsets.size(); ++i) {
--- a/paddle/fluid/operators/cudnn_lstm_op.cu.cc
+++ b/paddle/fluid/operators/cudnn_lstm_op.cu.cc
@ -378,7 +378,6 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel<T> {
            ->GetMutable<CudnnRNNCache>();

    auto input_dims = input->dims();
-    auto weight_dims = weight->dims();
    auto init_h_dims = init_h->dims();
    auto init_c_dims = init_c->dims();
    in_grad->mutable_data<T>(ctx.GetPlace());
--- a/paddle/fluid/operators/detail/strided_memcpy.h
+++ b/paddle/fluid/operators/detail/strided_memcpy.h
@ -27,8 +27,8 @@ struct StridedMemcpyFunctor;
 template <typename T>
 struct StridedMemcpyFunctor<T, 0> {
  void operator()(const platform::DeviceContext& dev_ctx, const T* src,
-                  framework::Dim<0> src_stride, framework::Dim<0> dst_dim,
-                  framework::Dim<0> dst_stride, T* dst) const {
+                  const int64_t* src_stride, const int64_t* dst_dim,
+                  const int64_t* dst_stride, T* dst) const {
    auto place = dev_ctx.GetPlace();
    if (platform::is_cpu_place(place)) {
      auto& cpu_place = boost::get<platform::CPUPlace>(place);
@ -50,18 +50,18 @@ struct StridedMemcpyFunctor<T, 0> {
 template <typename T>
 struct StridedMemcpyFunctor<T, 1> {
  void operator()(const platform::DeviceContext& dev_ctx, const T* src,
-                  framework::Dim<1> src_stride, framework::Dim<1> dst_dim,
-                  framework::Dim<1> dst_stride, T* dst) const {
+                  const int64_t* src_stride, const int64_t* dst_dim,
+                  const int64_t* dst_stride, T* dst) const {
    auto place = dev_ctx.GetPlace();
    if (platform::is_cpu_place(place)) {
      auto& cpu_place = boost::get<platform::CPUPlace>(place);
-      memory::Copy(cpu_place, dst, cpu_place, src, sizeof(T) * dst_dim.head);
+      memory::Copy(cpu_place, dst, cpu_place, src, sizeof(T) * dst_dim[0]);
    } else {
 #ifdef PADDLE_WITH_CUDA
      auto& gpu_place = boost::get<platform::CUDAPlace>(place);
      auto& cuda_ctx =
          reinterpret_cast<const platform::CUDADeviceContext&>(dev_ctx);
-      memory::Copy(gpu_place, dst, gpu_place, src, sizeof(T) * dst_dim.head,
+      memory::Copy(gpu_place, dst, gpu_place, src, sizeof(T) * dst_dim[0],
                   cuda_ctx.stream());
 #else
      PADDLE_THROW("Paddle is not compiled with GPU");
@ -73,19 +73,19 @@ struct StridedMemcpyFunctor<T, 1> {
 template <typename T, int Rank>
 struct StridedMemcpyFunctor {
  void operator()(const platform::DeviceContext& dev_ctx, const T* src,
-                  framework::Dim<Rank> src_stride, framework::Dim<Rank> dst_dim,
-                  framework::Dim<Rank> dst_stride, T* dst) const {
-    for (int64_t i = 0; i < dst_dim.head; ++i) {
+                  const int64_t* src_stride, const int64_t* dst_dim,
+                  const int64_t* dst_stride, T* dst) const {
+    for (int64_t i = 0; i < dst_dim[0]; ++i) {
      StridedMemcpyFunctor<T, Rank - 1> func;
-      func(dev_ctx, src, src_stride.tail, dst_dim.tail, dst_stride.tail, dst);
-      src += src_stride.head;
-      dst += dst_stride.head;
+      func(dev_ctx, src, src_stride + 1, dst_dim + 1, dst_stride + 1, dst);
+      src += src_stride[0];
+      dst += dst_stride[0];
    }
  }
 };

 template <typename T>
-struct StridedCopyDimVisitor : public boost::static_visitor<void> {
+struct StridedCopyDimVisitor {
  StridedCopyDimVisitor(const platform::DeviceContext& dev_ctx, const T* src,
                        const framework::DDim& src_stride,
                        const framework::DDim& dst_stride, T* dst)
@ -95,13 +95,11 @@ struct StridedCopyDimVisitor : public boost::static_visitor<void> {
        dst_stride_(dst_stride),
        dst_(dst) {}

-  template <typename Dim>
-  void operator()(Dim dst_dim) const {
-    Dim src_stride = boost::get<Dim>(src_stride_);
-    Dim dst_stride = boost::get<Dim>(dst_stride_);
-    constexpr int dim = Dim::dimensions;
-    StridedMemcpyFunctor<T, dim> functor;
-    functor(dev_ctx_, src_, src_stride, dst_dim, dst_stride, dst_);
+  template <int D>
+  void operator()(const framework::Dim<D>& dst_dim) const {
+    StridedMemcpyFunctor<T, D> functor;
+    functor(dev_ctx_, src_, src_stride_.data(), dst_dim.data(),
+            dst_stride_.data(), dst_);
  }

  const platform::DeviceContext& dev_ctx_;
--- a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc
+++ b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc
@ -64,8 +64,6 @@ class GenerateProposalLabelsOp : public framework::OperatorWithKernel {
        "Output(BboxOutsideWeights) of RpnTargetAssignOp should not be null");

    auto rpn_rois_dims = ctx->GetInputDim("RpnRois");
-    auto gt_classes_dims = ctx->GetInputDim("GtClasses");
-    auto is_crowd_dims = ctx->GetInputDim("IsCrowd");
    auto gt_boxes_dims = ctx->GetInputDim("GtBoxes");
    auto im_info_dims = ctx->GetInputDim("ImInfo");

--- a/paddle/fluid/operators/detection/generate_proposals_op.cc
+++ b/paddle/fluid/operators/detection/generate_proposals_op.cc
@ -53,12 +53,6 @@ class GenerateProposalsOp : public framework::OperatorWithKernel {
    PADDLE_ENFORCE(ctx->HasInput("Variances"),
                   "Input(Variances) shouldn't be null.");

-    auto scores_dims = ctx->GetInputDim("Scores");
-    auto bbox_deltas_dims = ctx->GetInputDim("BboxDeltas");
-    auto im_info_dims = ctx->GetInputDim("ImInfo");
-    auto anchors_dims = ctx->GetInputDim("Anchors");
-    auto variances_dims = ctx->GetInputDim("Variances");
-
    ctx->SetOutputDim("RpnRois", {-1, 4});
    ctx->SetOutputDim("RpnRoiProbs", {-1, 1});
  }
--- a/paddle/fluid/operators/detection/rpn_target_assign_op.cc
+++ b/paddle/fluid/operators/detection/rpn_target_assign_op.cc
@ -58,7 +58,6 @@ class RpnTargetAssignOp : public framework::OperatorWithKernel {

    auto anchor_dims = ctx->GetInputDim("Anchor");
    auto gt_boxes_dims = ctx->GetInputDim("GtBoxes");
-    auto is_crowd_dims = ctx->GetInputDim("IsCrowd");
    auto im_info_dims = ctx->GetInputDim("ImInfo");
    PADDLE_ENFORCE_EQ(anchor_dims.size(), 2,
                      "The rank of Input(Anchor) must be 2.");
--- a/paddle/fluid/operators/elementwise/elementwise_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_op.h
@ -178,7 +178,6 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel {

    auto x_dims = ctx->GetInputDim("X");
    auto y_dims = ctx->GetInputDim("Y");
-    auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));

    PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(),
                      "Rank of first input must >= rank of second input.");
--- a/paddle/fluid/operators/expand_op.h
+++ b/paddle/fluid/operators/expand_op.h
@ -77,7 +77,6 @@ class ExpandKernel : public framework::OpKernel<T> {
    auto& expand_times = context.Attr<std::vector<int>>("expand_times");
    auto* out0 = context.Output<Tensor>("Out");
    Eigen::DSizes<int, Rank> bcast_dims;
-    auto x_dims = in0->dims();
    for (size_t i = 0; i < expand_times.size(); ++i) {
      bcast_dims[i] = expand_times[i];
    }
--- a/paddle/fluid/operators/fc_op.cc
+++ b/paddle/fluid/operators/fc_op.cc
@ -148,7 +148,6 @@ class FCOpKernel : public framework::OpKernel<T> {
    auto w = ctx.Input<Tensor>("W");
    auto bias = ctx.Input<Tensor>("Bias");
    auto output = ctx.Output<Tensor>("Out");
-    auto in_dims = input->dims();
    auto w_dims = w->dims();
    auto out_dims = output->dims();
    int M = framework::product(out_dims) / out_dims[out_dims.size() - 1];
--- a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc
+++ b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc
@ -242,15 +242,15 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel<T> {
  bool is_reverse = ctx.Attr<bool>("is_reverse");     \
  bool use_peepholes = ctx.Attr<bool>("use_peepholes");

-#define INIT_BASE_SIZES                       \
-  auto ids_dims = ids->dims();   /* T x M*/   \
-  auto ids_numel = ids->numel(); /* T x 1*/   \
-  auto wh_dims = wh->dims();     /* D x 4D*/  \
-  const int D = wh_dims[0];                   \
-  const int D2 = D * 2;                       \
-  const int D3 = D * 3;                       \
-  int64_t row_number = embeddings->dims()[0]; \
-  int64_t row_width = embeddings->dims()[1];  \
+#define INIT_BASE_SIZES                                      \
+  auto ids_dims = ids->dims();                   /* T x M*/  \
+  auto ids_numel = framework::product(ids_dims); /* T x 1*/  \
+  auto wh_dims = wh->dims();                     /* D x 4D*/ \
+  const int D = wh_dims[0];                                  \
+  const int D2 = D * 2;                                      \
+  const int D3 = D * 3;                                      \
+  int64_t row_number = embeddings->dims()[0];                \
+  int64_t row_width = embeddings->dims()[1];                 \
  const int D4 = wh_dims[1];

 #define INIT_BASE_INPUT_DATAS                                        \
--- a/paddle/fluid/operators/hinge_loss_op.cc
+++ b/paddle/fluid/operators/hinge_loss_op.cc
@ -88,7 +88,6 @@ class HingeLossGradOp : public framework::OperatorWithKernel {
                   "Input(Logits@GRAD) should not be null.");

    auto pred_dims = ctx->GetInputDim("Logits");
-    auto lab_dims = ctx->GetInputDim("Labels");
    auto loss_grad_dims = ctx->GetInputDim(framework::GradVarName("Loss"));

    PADDLE_ENFORCE_EQ(loss_grad_dims, pred_dims);
--- a/paddle/fluid/operators/log_loss_op.cc
+++ b/paddle/fluid/operators/log_loss_op.cc
@ -92,7 +92,6 @@ class LogLossGradOp : public framework::OperatorWithKernel {
                   "Output(Predicted@GRAD) should not be null.");

    auto pred_dims = ctx->GetInputDim("Predicted");
-    auto label_dims = ctx->GetInputDim("Labels");
    auto loss_grad_dims = ctx->GetInputDim(framework::GradVarName("Loss"));
    PADDLE_ENFORCE_EQ(loss_grad_dims, pred_dims);

--- a/paddle/fluid/operators/math/math_function_impl.h
+++ b/paddle/fluid/operators/math/math_function_impl.h
@ -37,9 +37,6 @@ void Transpose<DeviceContext, T, Rank>::operator()(
  for (int i = 0; i < Rank; i++) {
    permute[i] = axis[i];
  }
-  auto in_dim = in.dims();
-  auto out_dim = out->dims();
-
  auto eigen_in = framework::EigenTensor<T, Rank>::From(in);
  auto eigen_out = framework::EigenTensor<T, Rank>::From(*out);
  auto* dev = context.eigen_device();
--- a/paddle/fluid/operators/math/softmax_impl.h
+++ b/paddle/fluid/operators/math/softmax_impl.h
@ -76,7 +76,6 @@ class SoftmaxFunctor<DeviceContext, float, true, enable_if_CPU<DeviceContext>> {
  void operator()(const DeviceContext& context, const framework::Tensor* X,
                  framework::Tensor* Y) {
    auto in_dims = X->dims();
-    auto out_dims = Y->dims();
    const float* in_data = X->data<float>();
    float* out_data = Y->data<float>();
    const int kBatchDim = 0;
--- a/paddle/fluid/operators/modified_huber_loss_op.cc
+++ b/paddle/fluid/operators/modified_huber_loss_op.cc
@ -87,7 +87,6 @@ class ModifiedHuberLossGradOp : public framework::OperatorWithKernel {
                   "Input(Out@Grad) must not be null.");

    auto x_dims = ctx->GetInputDim("X");
-    auto y_dims = ctx->GetInputDim("Y");
    auto intermediate_dims = ctx->GetInputDim("IntermediateVal");
    auto out_grad_dims = ctx->GetInputDim(framework::GradVarName("Out"));

--- a/paddle/fluid/operators/mul_op.cc
+++ b/paddle/fluid/operators/mul_op.cc
@ -146,12 +146,6 @@ class MulGradOp : public framework::OperatorWithKernel {
                   "Input(Out@GRAD) should not be null");
    auto x_dims = ctx->GetInputDim("X");
    auto y_dims = ctx->GetInputDim("Y");
-    auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
-
-    auto x_mat_dims = framework::flatten_to_2d(
-        x_dims, ctx->Attrs().Get<int>("x_num_col_dims"));
-    auto y_mat_dims = framework::flatten_to_2d(
-        y_dims, ctx->Attrs().Get<int>("y_num_col_dims"));

    auto x_grad_name = framework::GradVarName("X");
    auto y_grad_name = framework::GradVarName("Y");
--- a/Show More
+++ b/Show More