!1241 update include header files 0311

From: @shenwei41 Reviewed-by: @lilongfei15,@xsmq Signed-off-by: @xsmq
4 years ago · c0f3dcb4f4
parent 8737b1843d 54a48678ae
commit c0f3dcb4f4
21 changed files with 1178 additions and 323 deletions
--- a/inc/external/acl/error_codes/ge_error_codes.h
+++ b/inc/external/acl/error_codes/ge_error_codes.h
@ -53,9 +53,9 @@ static const uint32_t ACL_ERROR_GE_AIPP_MODE_INVALID = 145016;
 static const uint32_t ACL_ERROR_GE_OP_TASK_TYPE_INVALID = 145017;
 static const uint32_t ACL_ERROR_GE_OP_KERNEL_TYPE_INVALID = 145018;
 static const uint32_t ACL_ERROR_GE_PLGMGR_PATH_INVALID = 145019;
-static const uint32_t ACL_ERROR_GE_TRANSSHAPE_FORMAT_INVALID = 145020;
-static const uint32_t ACL_ERROR_GE_TRANSSHAPE_SHAPE_INVALID = 145021;
-static const uint32_t ACL_ERROR_GE_TRANSSHAPE_DATATYPE_INVALID = 145022;
+static const uint32_t ACL_ERROR_GE_FORMAT_INVALID = 145020;
+static const uint32_t ACL_ERROR_GE_SHAPE_INVALID = 145021;
+static const uint32_t ACL_ERROR_GE_DATATYPE_INVALID = 145022;
 static const uint32_t ACL_ERROR_GE_MEMORY_ALLOCATION = 245000;
 static const uint32_t ACL_ERROR_GE_MEMORY_OPERATE_FAILED = 245001;
 static const uint32_t ACL_ERROR_GE_INTERNAL_ERROR = 545000;
--- a/inc/external/acl/ops/acl_dvpp.h
+++ b/inc/external/acl/ops/acl_dvpp.h
--- a/scripts/format_source_code.sh
+++ b/scripts/format_source_code.sh
--- a/third_party/fwkacllib/inc/ops/avg_pool_1d_ops.h
+++ b/third_party/fwkacllib/inc/ops/avg_pool_1d_ops.h
@ -0,0 +1,58 @@
+/**
+ * Copyright 2019 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*!
+ * \file avg_pool_1d_ops.h
+ * \brief
+ */
+#ifndef OPS_BUILT_IN_OP_PROTO_INC_AVGPOOL1DOPS_H_
+#define OPS_BUILT_IN_OP_PROTO_INC_AVGPOOL1DOPS_H_
+#include "graph/operator_reg.h"
+
+namespace ge {
+/**
+*@brief Generate an auxiliary matrix .  \n
+
+*@par Inputs:
+* @li x: A tensor. Must be one of the following types:uint8, int8,int16, int32,
+ int64, float16, float, double.The format must be NHWC NCHW NC1HWC0.
+
+*@par Attributes:
+*@li ksize: Kernel size. Input type is int.
+*@li strides: Input type is int.
+*@li pads: Input type is listInt .
+*@li ceil_mode: Bool, default value is false.
+*@li count_include_pad: Bool, default value is false.  \n
+
+*@par Outputs:
+*y: A tensor with the same type as "x" .  \n
+*@par Third-party framework compatibility
+
+*Compatible with the TensorFlow operator Unbatch.
+*/
+REG_OP(AvgPool1DAvgMatrix)
+    .INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT16, DT_UINT8,
+                          DT_INT32, DT_INT64, DT_DOUBLE}))
+    .OUTPUT(y, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT16, DT_UINT8,
+                           DT_INT32, DT_INT64, DT_DOUBLE}))
+    .REQUIRED_ATTR(ksize, Int)
+    .REQUIRED_ATTR(strides, Int)
+    .REQUIRED_ATTR(pads, ListInt)
+    .ATTR(ceil_mode, Bool, false)
+    .ATTR(count_include_pad, Bool, false)
+    .OP_END_FACTORY_REG(AvgPool1DAvgMatrix)
+}
+#endif
--- a/third_party/fwkacllib/inc/ops/elewise_calculation_ops.h
+++ b/third_party/fwkacllib/inc/ops/elewise_calculation_ops.h
@ -2454,6 +2454,25 @@ REG_OP(Eltwise)
    .ATTR(coeff, ListFloat, {})
    .OP_END_FACTORY_REG(Eltwise)

+/**
+ *@brief Computes the inverse error function of each element of input. \n
+
+ *@par Inputs:
+ *One input, including:
+ * @li input_x: A tensor. Must be one of the following types:
+ *     float16, float32. \n
+
+ *@par Outputs:
+ *output_y: A Tensor with the same type and shape as input_x. \n
+
+ *@par Third-party framework compatibility
+ *Compatible with the Pytorch operator Erfinv. \n
+ */
+REG_OP(Erfinv)
+    .INPUT(input_x, TensorType({DT_FLOAT, DT_FLOAT16}))
+    .OUTPUT(output_y, TensorType({DT_FLOAT, DT_FLOAT16}))
+    .OP_END_FACTORY_REG(Erfinv)
+
 /**
 *@brief Computes element-wise population count. \n

--- a/third_party/fwkacllib/inc/ops/image_ops.h
+++ b/third_party/fwkacllib/inc/ops/image_ops.h
@ -1516,6 +1516,96 @@ REG_OP(DenseImageWarp)
    .OUTPUT(y, TensorType({DT_FLOAT, DT_FLOAT16}))
    .OP_END_FACTORY_REG(DenseImageWarp)

+/**
+*@brief Calculate the resize_d function. \n
+
+*@par Inputs:
+*One input, including:
+* @li x: A tensor. Must be one of the following types:
+*     float16, float32. \n
+
+*@par Attributes:
+*@li sizes: A required listInt. \n
+*@li scales: An optional listFloat.
+    Defaults to none. \n
+*@li roi: An optional listInt.
+    Defaults to none. \n
+*@li coordinate_transformation_mode: An optional String.
+    Defaults to "half_pixel". \n
+*@li cubic_coeff_a: An optional float.
+    Defaults to -0.75. \n
+*@li exclude_outside: An optional int.
+    Defaults to 0. \n
+*@li extrapolation_value: An optional float.
+    Defaults to 0.0. \n
+*@li mode: An optional String.
+    Defaults to "nearest". \n
+*@li nearest_mode: An optional String.
+    Defaults to "round_prefer_floor". \n
+
+*@par Outputs:
+*y: A Tensor with the same type of x's,
+    shape depends on x and sizes. \n
+*/
+REG_OP(ResizeD)
+    .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT}))
+    .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT}))
+    .REQUIRED_ATTR(sizes, ListInt)
+    .ATTR(scales, ListFloat, {})
+    .ATTR(roi, ListInt, {})
+    .ATTR(coordinate_transformation_mode, String, "half_pixel")
+    .ATTR(cubic_coeff_a, Float, -0.75)
+    .ATTR(exclude_outside, Int, 0)
+    .ATTR(extrapolation_value, Float, 0.0)
+    .ATTR(mode, String, "nearest")
+    .ATTR(nearest_mode, String, "round_prefer_floor")
+    .OP_END_FACTORY_REG(ResizeD)
+
+/**
+*@brief Calculate the resize_grad_d function. \n
+
+*@par Inputs:
+*One input, including:
+* @li grads: A tensor. Must be one of the following types:
+*     float16, float32. \n
+
+*@par Attributes:
+*@li original_size: A required listInt. \n
+*@li roi: An optional listInt.
+    Defaults to none. \n
+*@li scales: An optional listFloat.
+    Defaults to none. \n
+*@li coordinate_transformation_mode: An optional String.
+    Defaults to "half_pixel". \n
+*@li cubic_coeff_a: An optional float.
+    Defaults to -0.75. \n
+*@li exclude_outside: An optional int.
+    Defaults to 0. \n
+*@li extrapolation_value: An optional float.
+    Defaults to 0.0. \n
+*@li mode: An optional String.
+    Defaults to "nearest". \n
+*@li nearest_mode: An optional String.
+    Defaults to "round_prefer_floor". \n
+
+*@par Outputs:
+*y: A Tensor with the same type as grads,
+    shape presumably determined by original_size (TODO confirm). \n
+*/
+REG_OP(ResizeGradD)
+    .INPUT(grads, TensorType({DT_FLOAT16, DT_FLOAT}))
+    .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT}))
+    .REQUIRED_ATTR(original_size, ListInt)
+    .ATTR(roi, ListInt, {})
+    .ATTR(scales, ListFloat, {})
+    .ATTR(coordinate_transformation_mode, String, "half_pixel")
+    .ATTR(cubic_coeff_a, Float, -0.75)
+    .ATTR(exclude_outside, Int, 0)
+    .ATTR(extrapolation_value, Float, 0.0)
+    .ATTR(mode, String, "nearest")
+    .ATTR(nearest_mode, String, "round_prefer_floor")
+    .OP_END_FACTORY_REG(ResizeGradD)
+
 /**
 *@brief Computes the gradients of DenseImageWarp with respect to image and flow. \n

@ -1535,5 +1625,81 @@ REG_OP(DenseImageWarpGrad)
    .OUTPUT(grad_image, TensorType({DT_FLOAT, DT_FLOAT16}))
    .OUTPUT(grad_flow, TensorType({DT_FLOAT, DT_FLOAT16}))
    .OP_END_FACTORY_REG(DenseImageWarpGrad)
+
+/**
+*@brief This operation samples input x by using interpolation based on flow field grid,
+ which is usually generated by affine_grid. The grid of shape [N, H, W, 2] is the concatenation of
+ (x, y) coordinates with shape [N, H, W] each, where x is indexing the 4th dimension (in width dimension) of
+ input data x and y is indexing the 3rd dimension (in height dimension). The final result is
+ the interpolation value of the 4 nearest corner points. The output tensor shape will be [N, C, H, W].
+
+*@par Inputs:
+*@li x: 4-D Tensor with shape `[batch, channels, height, width]`.
+*@li grid: flow field grid, 4-D Tensor with shape `[batch, height, width, 2]`.
+
+*@par Attributes:
+*@li interpolation_mode: An optional string specifying the interpolation method. Only 'bilinear' is
+ supported for now .
+*@li padding_mode: An optional string specifying the pad method. Only 'zeros' is supported for now .
+*@li align_corners: An optional bool. If "true", the centers of the corner
+ pixels of the input and output tensors are aligned. Defaults to "false" .
+
+*@par Outputs:
+*y: Returns 4-D Tensor with the same dtype as `x`.
+
+*@par Third-party framework compatibility
+*Compatible with pytorch GridSampler2D operator.
+*/
+REG_OP(GridSampler2D)
+    .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT}))
+    .INPUT(grid, TensorType({DT_FLOAT16, DT_FLOAT}))
+    .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT}))
+    .ATTR(interpolation_mode, String, "bilinear")
+    .ATTR(padding_mode, String, "zeros")
+    .ATTR(align_corners, Bool, false)
+    .OP_END_FACTORY_REG(GridSampler2D)
+
+/**
+*@brief This operation unnormalizes input grid, which is usually generated by affine_grid.
+
+*@par Inputs:
+*@li grid: flow field grid, 4-D Tensor with shape `[batch, height, width, 2]`.
+*@li assist: Assist matrix, a 4-D tensor of type float16 or float32.
+
+*@par Attributes:
+*@li align_corners: An optional bool. If "true", the centers of the corner
+ pixels of the input and output tensors are aligned. Defaults to "false" .
+
+*@par Outputs:
+*@li diff: Returns 4-D Tensor with the same shape and dtype as `grid`.
+*@li position: Returns 4-D Tensor of type int32 with the same shape as `grid`.
+*/
+REG_OP(GridUnnormal)
+    .INPUT(grid, TensorType({DT_FLOAT16, DT_FLOAT}))
+    .INPUT(assist, TensorType({DT_FLOAT16, DT_FLOAT}))
+    .OUTPUT(diff, TensorType({DT_FLOAT16, DT_FLOAT}))
+    .OUTPUT(position, TensorType({DT_INT32}))
+    .ATTR(align_corners, Bool, false)
+    .OP_END_FACTORY_REG(GridUnnormal)
+
+/**
+*@brief This operation unfolds input x based on unnormalized grid, which is generated by GridUnnormal.
+
+*@par Inputs:
+*@li x: 4-D Tensor with shape `[batch, channels, height, width]`.
+*@li position: 4-D Tensor with shape `[batch, output_height, output_width, 2]`.
+
+*@par Attributes:
+*@li padding_mode: An optional string specifying the pad method. Only 'zeros' is supported for now .
+
+*@par Outputs:
+*y: Returns 4-D Tensor with the same dtype as `x`.
+*/
+REG_OP(ImageUnfold)
+    .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT}))
+    .INPUT(position, TensorType({DT_INT32}))
+    .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT}))
+    .ATTR(padding_mode, String, "zeros")
+    .OP_END_FACTORY_REG(ImageUnfold)
 }  // namespace ge
 #endif  // OPS_BUILT_IN_OP_PROTO_INC_IMAGE_OPS_H_
--- a/third_party/fwkacllib/inc/ops/list_ops.h
+++ b/third_party/fwkacllib/inc/ops/list_ops.h
--- a/third_party/fwkacllib/inc/ops/math_ops.h
+++ b/third_party/fwkacllib/inc/ops/math_ops.h
@ -982,6 +982,65 @@ REG_OP(SoftMarginLossGrad)
    .ATTR(reduction, String, "mean")
    .OP_END_FACTORY_REG(SoftMarginLossGrad)

+/**
+ *@brief Computes the batched p-norm distance between each pair of
+ *the two collections of row vectors. \n
+
+ *@par Inputs:
+ *Two inputs, including:
+ * @li x1: A tensor with shape: BxPxM. Must be one of the following types:
+ *     float16, float32. \n
+ * @li x2: A tensor with shape: BxRxM. Must be one of the following types:
+ *     float16, float32. \n
+
+ *@par Attributes:
+ * @li p: An optional float >= 0 or inf. Defaults to 2.0. \n
+
+ *@par Outputs:
+ * y: A Tensor with the same type of x1's and with shape BxPxR. \n
+
+ *@par Third-party framework compatibility
+ *Compatible with the Pytorch operator Cdist. \n
+ */
+REG_OP(Cdist)
+    .INPUT(x1, TensorType({DT_FLOAT16, DT_FLOAT}))
+    .INPUT(x2, TensorType({DT_FLOAT16, DT_FLOAT}))
+    .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT}))
+    .ATTR(p, Float, 2.0)
+    .OP_END_FACTORY_REG(Cdist)
+
+/**
+*@brief  Computes the grad of x1 in cdist. \n
+
+*@par Inputs:
+*Four inputs, including:
+ * @li grad: Grad with shape BxPxR. Must be one of the following types:
+*     float16, float32. \n
+* @li x1: A tensor with shape: BxPxM. Must be one of the following types:
+*     float16, float32. \n
+* @li x2: A tensor with shape: BxRxM. Must be one of the following types:
+*     float16, float32. \n
+* @li cdist: Output tensor of cdist forward with shape: BxPxR.
+*     Must be one of the following types: float16, float32. \n
+
+*@par Attributes:
+* @li p: An optional float >= 0 or inf. Defaults to 2.0. \n
+
+*@par Outputs:
+* y: A Tensor with the same type and shape of x1's. \n
+
+*@par Third-party framework compatibility
+*Compatible with the Pytorch operator Cdist Backward. \n
+*/
+REG_OP(CdistGrad)
+    .INPUT(grad, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .INPUT(x1, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .INPUT(x2, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .INPUT(cdist, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .OUTPUT(y, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .ATTR(p, Float, 2.0)
+    .OP_END_FACTORY_REG(CdistGrad)
+
 }  // namespace ge

 #endif  // OPS_BUILT_IN_OP_PROTO_INC_MATH_OPS_H_
--- a/third_party/fwkacllib/inc/ops/matrix_calculation_ops.h
+++ b/third_party/fwkacllib/inc/ops/matrix_calculation_ops.h
@ -1065,7 +1065,37 @@ REG_OP(Tril)
    .ATTR(diagonal, Int, 0)
    .OUTPUT(y, TensorType::BasicType())
    .OP_END_FACTORY_REG(Tril)
+/**
+*@brief Sums the product of the elements of the input operands along dimensions specified
+* using a notation based on the Einstein summation convention. \n
+*@par Inputs:
+* One input, including:
+* @li x: A list of Tensors. Must be one of the following types:  int32, float16, float32.
+*     The operands of the Einstein summation.
+*     This is a dynamic input. \n

+*@par Attributes:
+*@li equation: A required string. The subscripts for the Einstein summation. \n
+*@li tensor_size: A required int. Tensor size of input. \n
+
+*@par Outputs:
+*@li y: Sums the product of the elements of the input operands along dimensions specified
+ using a notation based on the Einstein summation convention. \n
+
+*@attention Constraints:
+*Input tensor_size must be Int. \n
+
+*@par Third-party framework compatibility
+*Compatible with Pytorch einsum operator.
+*/
+REG_OP(EinSum)
+    .DYNAMIC_INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT32}))
+    .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT32}))
+    .REQUIRED_ATTR(equation, String)
+    .REQUIRED_ATTR(tensor_size, Int)
+    .OP_END_FACTORY_REG(EinSum)
 }  // namespace ge

 #endif  // OPS_BUILT_IN_OP_PROTO_INC_MATRIX_CALCULATION_OPS_H_
--- a/third_party/fwkacllib/inc/ops/nn_norm_ops.h
+++ b/third_party/fwkacllib/inc/ops/nn_norm_ops.h
@ -427,6 +427,33 @@ REG_OP(MVN)
    .ATTR(eps, Float, 1e-9)
    .OP_END_FACTORY_REG(MVN)

+/**
+*@brief Normalizes the input . \n
+
+*@par Inputs:
+* One input:
+*x: An NCHW tensor of type float16 or float32 . \n
+
+*@par Attributes:
+*@li eps: An optional float32 epsilon for not dividing by zero. Defaults to "1e-9" . \n
+*@li axes: A list of Integers, along which axis to reduce. Defaults to "[0, 2, 3]" . \n
+
+*@par Outputs:
+*y: An NCHW tensor of type float16 or float32 . \n
+
+*@attention Constraints:
+* The input tensor must have the NCHW format, whose shape length must be 4.
+*@par Third-party framework compatibility
+* Compatible with the ONNX operator MeanVarianceNormalization.
+*/
+
+REG_OP(MVNV2)
+    .INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16})) /* "First operand." */
+    .OUTPUT(y, TensorType({DT_FLOAT, DT_FLOAT16}))  /* "Result, has same element type as inputs" */
+    .ATTR(eps, Float, 1e-9)
+    .ATTR(axes, ListInt, {0, 2, 3})
+    .OP_END_FACTORY_REG(MVNV2)
+
 /**
 *@brief Normalizes the input "x1" . \n

@ -1205,6 +1232,33 @@ REG_OP(Centralization)
    .ATTR(axes, ListInt, {-1})
    .OP_END_FACTORY_REG(Centralization)

+/**
+*@brief Roll the tensor along the given dimension(s).
+* Elements that are shifted beyond the last position are re-introduced at the first position.
+* If a dimension is not specified, the tensor will be flattened before rolling and then restored to the original shape. \n
+
+*@par Inputs:
+*One input, including:
+* @li x: A tensor . Must be one of the following types:
+*     float16, float32, int32, uint32, int8, uint8. \n
+
+*@par Attributes:
+* @li shifts: The number of places by which the elements of the tensor are shifted. \n
+* @li dims: Axis along which to roll. \n
+
+*@par Outputs:
+* y: A Tensor with the same type and shape of x's. \n
+
+*@par Third-party framework compatibility
+*Compatible with the Pytorch operator Roll. \n
+*/
+REG_OP(Roll)
+    .INPUT(x, TensorType({DT_FLOAT16,DT_FLOAT,DT_INT32,DT_UINT32,DT_INT8,DT_UINT8}))
+    .OUTPUT(y, TensorType({DT_FLOAT16,DT_FLOAT,DT_INT32,DT_UINT32,DT_INT8,DT_UINT8}))
+    .REQUIRED_ATTR(shifts, ListInt)
+    .ATTR(dims, ListInt, {})
+    .OP_END_FACTORY_REG(Roll)
+
 /**
 *@brief Calculate the loss. Creates a criterion that optimizes a two-class classification
 logistic loss between input_x and input_y (containing 1 or -1). \n
--- a/third_party/fwkacllib/inc/ops/nn_ops.h
+++ b/third_party/fwkacllib/inc/ops/nn_ops.h
@ -49,5 +49,60 @@ REG_OP(InTopKV2)
    .INPUT(k, TensorType({IndexNumberType}))
    .OUTPUT(precision, TensorType({DT_BOOL}))
    .OP_END_FACTORY_REG(InTopKV2)
+
+/**
+*@brief Performs batch normalization . \n
+
+*@par Inputs:
+* Five inputs, including: (NHWC, NCHW, or NC1HWC0 supported)
+*@li x: A 4D or 5D Tensor of type float16 or float32, with format NHWC or NCHW for 4D or NC1HWC0 for 5D.
+*@li scale: A Tensor of type float32. Must be 1D if input "x" is with format NHWC or NCHW. Must be 5D
+if input "x" is with format NC1HWC0. Specifies the scaling factor.
+*@li offset: A Tensor of type float32. Must be 1D if input "x" is with format NHWC or NCHW. Must be 5D
+if input "x" is with format NC1HWC0. Specifies the offset.
+*@li mean: A Tensor of type float32. Must be 1D if input "x" is with format NHWC or NCHW. Must be 5D
+if input "x" is with format NC1HWC0. Specifies the mean used for inference. Must be "None" if the
+operation is used for training.
+*@li variance: A Tensor of type float32. Must be 1D if input "x" is with format NHWC or NCHW. Must be
+5D if input "x" is with format NC1HWC0. Specifies the variance used for inference. Must be "None"
+if the operation is used for training . \n
+
+*@par Attributes:
+*@li epsilon: An optional float32, specifying the small value added to variance to avoid dividing by zero. Defaults to "0.0001".
+*@li data_format: An optional string, specifying the format of "x". Defaults to "NHWC".
+*@li is_training: An optional bool, specifying if the operation is used for training or inference. Defaults to "True" . \n
+
+*@par Outputs:
+* Five outputs, including: (NHWC, NCHW, or NC1HWC0 supported)
+*@li y: A 4D or 5D Tensor of type float16 or float32 for the normalized "x", with format NHWC or NCHW for 4D or NC1HWC0 for 5D.
+*@li batch_mean: A Tensor of type float32. Must be 1D if input "x" is with format NHWC or NCHW. Must be 5D
+if input "x" is with format NC1HWC0. Specifies the mean of "x".
+*@li batch_variance: A Tensor of type float32. Must be 1D if input "x" is with format NHWC or NCHW.
+Must be 5D if input "x" is with format NC1HWC0. Specifies the variance of "x".
+*@li reserve_space_1: An optional Tensor of type float32. Must be 1D if input "x" is with format NHWC or NCHW.
+Must be 5D if input "x" is with format NC1HWC0. Specifies the mean of "x" for gradient computation. Pass "None" to skip this output.
+*@li reserve_space_2: An optional Tensor of type float32. Must be 1D if input "x" is with format NHWC or NCHW.
+Must be 5D if input "x" is with format NC1HWC0. Specifies the variance of "x" for gradient computation. Pass "None" to skip this output . \n
+
+*@attention Constraints:
+*@li If the operation is used for inference and outputs "reserve_space_1" and "reserve_space_2" are available,
+then "reserve_space_1" has the same value as "mean" and "reserve_space_2" has the same value as "variance".
+*@li For Ascend 310, the result accuracy fails to reach 1‰ due to the square root instruction . \n
+*/
+REG_OP(FusedBatchNormV2)
+    .INPUT(x, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .INPUT(scale, TensorType({DT_FLOAT}))
+    .INPUT(offset, TensorType({DT_FLOAT}))
+    .OPTIONAL_INPUT(mean, TensorType({DT_FLOAT}))
+    .OPTIONAL_INPUT(variance, TensorType({DT_FLOAT}))
+    .OUTPUT(y, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .OUTPUT(batch_mean, TensorType({DT_FLOAT}))
+    .OUTPUT(batch_variance, TensorType({DT_FLOAT}))
+    .OUTPUT(reserve_space_1, TensorType({DT_FLOAT}))
+    .OUTPUT(reserve_space_2, TensorType({DT_FLOAT}))
+    .ATTR(epsilon, Float, 0.0001)
+    .ATTR(data_format, String, "NHWC")
+    .ATTR(is_training, Bool, true)
+    .OP_END_FACTORY_REG(FusedBatchNormV2)
 }// namespace ge
 #endif  // OPS_BUILT_IN_OP_PROTO_INC_NN_OPS_H_
--- a/third_party/fwkacllib/inc/ops/nn_pooling_ops.h
+++ b/third_party/fwkacllib/inc/ops/nn_pooling_ops.h
@ -1502,14 +1502,14 @@ REG_OP(AdaptiveAvgPool2d)
 * @brief Compute gradients of adaptive averagev2 pooling function.

 * @par Inputs:
-* @li input_grad: A NCHW Tensor. Must be one of the following data types:
+* @li input_grad: A Tensor. Must be one of the following data types:
 * float16, float32.

 * @par Attributes:
 * @li orig_input_shape: A required tuple or list of type int32.

 * @par Outputs:
-* @li output_grad: A tensor with the same shape and type as "orig_input_shape".
+* @li output_grad: A tensor with the same type as "input_grad".

 * @par Third-party framework compatibility
 * Compatible with the Pytorch operator AdaptiveAvgPool2dGrad.
--- a/third_party/fwkacllib/inc/ops/nonlinear_fuc_ops.h
+++ b/third_party/fwkacllib/inc/ops/nonlinear_fuc_ops.h
@ -530,6 +530,34 @@ REG_OP(Elu)
    .ATTR(alpha, Float, 1.0)
    .OP_END_FACTORY_REG(Elu)

+/**
+*@brief Continuously Differentiable Exponential Linear Units:
+*       Perform the linear unit element-wise on the input tensor X using formula:
+*       max(0, x) + min(0, alpha * (exp(x/alpha) - 1)). \n
+
+*@par Inputs:
+*x: A float16, float32 or double, for the input data type . \n
+
+*@par Attributes:
+*alpha: A float32. Defines at which negative value the ELU saturates. Defaults to "1.0" . \n
+
+*@par Outputs:
+*y: A float16, float32 or double, for the normalized result . \n
+
+*@attention Constraints:
+*@li The input is of type float16 or float32 . \n
+
+*@par Multiple batches supported or not
+*Supported
+*@par Third-party framework compatibility
+*@li Compatible with ONNX's Celu operator
+*/
+REG_OP(Celu)
+    .INPUT(x, TensorType::FloatingDataType())
+    .OUTPUT(y, TensorType::FloatingDataType())
+    .ATTR(alpha, Float, 1.0)
+    .OP_END_FACTORY_REG(Celu)
+
 /**
 *@brief Computes gradients for the exponential linear (Elu) operation.
 *
--- a/third_party/fwkacllib/inc/ops/pad_ops.h
+++ b/third_party/fwkacllib/inc/ops/pad_ops.h
@ -101,7 +101,7 @@ REG_OP(FillD)
 */
 REG_OP(BroadcastTo)
    .INPUT(x, TensorType::BasicType())
-    .INPUT(shape, TensorType({DT_INT32}))
+    .INPUT(shape, TensorType({DT_INT32,DT_INT64}))
    .OUTPUT(y, TensorType::BasicType())
    .OP_END_FACTORY_REG(BroadcastTo)

--- a/third_party/fwkacllib/inc/ops/selection_ops.h
+++ b/third_party/fwkacllib/inc/ops/selection_ops.h
@ -239,6 +239,30 @@ REG_OP(GatherV2D)
    .REQUIRED_ATTR(axis, Int)
    .OP_END_FACTORY_REG(GatherV2D)

+/**
+*@brief Gathers values along an axis specified by dim . \n
+
+*@par Inputs:
+*@li x: A Tensor. Must be one of the following types: float16, float32, int32, int64.
+*@li index: A Tensor. Must be one of the following types: int64 . \n
+
+*@par Attributes:
+* dim: the axis along which to index . \n
+
+*@par Outputs:
+* y: A Tensor. Has the same type as "x" . \n
+
+*@par Third-party framework compatibility
+*Compatible with the PyTorch operator Gather.
+*/
+
+REG_OP(GatherElements)
+    .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT32, DT_INT64}))
+    .INPUT(index, TensorType({DT_INT64}))
+    .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT32, DT_INT64}))
+    .ATTR(dim, Int, 0)
+    .OP_END_FACTORY_REG(GatherElements)
+
 /**
 *@brief Extracts a strided slice of a tensor. Roughly speaking, this op
    extracts a slice of size (end-begin)/stride from the given input tensor.
@ -486,6 +510,38 @@ REG_OP(UnsortedSegmentSum)
    .OUTPUT(y, TensorType::NumberType())
    .OP_END_FACTORY_REG(UnsortedSegmentSum)

+/**
+*@brief Creates a one-dimensional tensor of size steps whose values are evenly spaced from start to
+* end, inclusive, on a logarithmic scale with base base. \n
+
+*@par Inputs:
+*One input, including:
+* @li assist: A tensor. Must be one of the following types:
+*     float16, float32. \n
+
+* @par Attributes:
+* @li start: A required float. Used to select the start. \n
+* @li end: A required float. Used to select the end. \n
+* @li steps: An optional int. Defaults to 100. \n
+* @li base: An optional float. Defaults to 10.0. \n
+* @li dtype: An optional int. Defaults to 1. \n
+
+*@par Outputs:
+*y: A Tensor with the same type and shape as "assist". \n
+
+*@par Third-party framework compatibility
+*Compatible with the Pytorch operator logspaced. \n
+*/
+REG_OP(LogSpaceD)
+    .INPUT(assist, TensorType({DT_FLOAT, DT_FLOAT16}))
+    .OUTPUT(y, TensorType({DT_FLOAT, DT_FLOAT16}))
+    .REQUIRED_ATTR (start, Float)
+    .REQUIRED_ATTR (end, Float)
+    .ATTR(steps, Int, 100)
+    .ATTR(base, Float, 10.0)
+    .ATTR(dtype, Int, 1)
+    .OP_END_FACTORY_REG(LogSpaceD)
+
 /**
 *@brief Computes the sum along segments of a tensor . \n

--- a/third_party/fwkacllib/inc/runtime/base.h
+++ b/third_party/fwkacllib/inc/runtime/base.h
@ -339,7 +339,7 @@ RTS_API rtError_t rtLabelCreateEx(rtLabel_t *label, rtStream_t stream);
 * @return RT_ERROR_NONE for ok
 * @return RT_ERROR_INVALID_VALUE for error input
 */
-rtError_t rtLabelCreateExV2(rtLabel_t *label, rtModel_t model, rtStream_t stream);
+RTS_API rtError_t rtLabelCreateExV2(rtLabel_t *label, rtModel_t model, rtStream_t stream);

 /**
 * @ingroup dvrt_base
--- a/third_party/fwkacllib/inc/runtime/config.h
+++ b/third_party/fwkacllib/inc/runtime/config.h
@ -132,6 +132,11 @@ typedef struct tagRtPlatformConfig {
    uint32_t platformConfig;
 } rtPlatformConfig_t;

+typedef enum tagRTTaskTimeoutType {
+    RT_TIMEOUT_TYPE_OP_WAIT = 0,
+    RT_TIMEOUT_TYPE_OP_EXECUTE,
+} rtTaskTimeoutType_t;
+
 /**
 * @ingroup
 * @brief get AI core count
@ -203,6 +208,24 @@ RTS_API rtError_t rtGetRuntimeVersion(uint32_t *runtimeVersion);
 */
 RTS_API rtError_t rtGetDeviceCapability(int32_t deviceId, int32_t moduleType, int32_t featureType, int32_t *value);

+/**
+ * @ingroup
+ * @brief set event wait task timeout time.
+ * @param [in] timeout
+ * @return RT_ERROR_NONE for ok
+ * @return RT_ERROR_INVALID_VALUE for error input
+ */
+RTS_API rtError_t rtSetOpWaitTimeOut(uint32_t timeout);
+
+/**
+ * @ingroup
+ * @brief set op execute task timeout time.
+ * @param [in] timeout
+ * @return RT_ERROR_NONE for ok
+ * @return RT_ERROR_INVALID_VALUE for error input
+ */
+RTS_API rtError_t rtSetOpExecuteTimeOut(uint32_t timeout);
+
 #if defined(__cplusplus) && !defined(COMPILE_OMG_PACKAGE)
 }
 #endif
--- a/third_party/fwkacllib/inc/runtime/kernel.h
+++ b/third_party/fwkacllib/inc/runtime/kernel.h
@ -188,7 +188,7 @@ typedef void (*rtCallback_t)(void *fnData);
 /**
 * @ingroup rt_kernel
 * @brief kernel mode
- */
+**/
 #define RT_DEFAULT_KERNEL_MODE (0x00)
 #define RT_NORMAL_KERNEL_MODE (0x01)
 #define RT_ALL_KERNEL_MODE (0x02)
@ -211,7 +211,7 @@ RTS_API rtError_t rtDevBinaryRegister(const rtDevBinary_t *bin, void **handle);

 /**
 * @ingroup rt_kernel
- * @brief register device binary
+ * @brief register device binary with all kernel
 * @param [in] bin   device binary description
 * @param [out] handle   device binary handle
 * @return RT_ERROR_NONE for ok
@ -330,7 +330,7 @@ RTS_API rtError_t rtKernelLaunch(const void *stubFunc, uint32_t blockDim, void *
 * @ingroup rt_kernel
 * @brief launch kernel with handle to device
 * @param [in] handle   program
- * @param [in] devFunc    device function description
+ * @param [in] devFunc   device function description.
 * @param [in] blockDim   block dimentions
 * @param [in] args   argments address for kernel function
 * @param [in] argsSize   argements size
@ -341,7 +341,7 @@ RTS_API rtError_t rtKernelLaunch(const void *stubFunc, uint32_t blockDim, void *
 * @return RT_ERROR_INVALID_VALUE for error input
 */
 RTS_API rtError_t rtKernelLaunchWithHandle(void *handle, const void *devFunc, uint32_t blockDim, void *args, uint32_t argsSize,
-                                           rtSmDesc_t *smDesc, rtStream_t stream, const void *kernelInfo);
+                                            rtSmDesc_t *smDesc, rtStream_t stream_, const void *kernelInfo);

 /**
 * @ingroup rt_kernel
--- a/third_party/fwkacllib/inc/runtime/rt_model.h
+++ b/third_party/fwkacllib/inc/runtime/rt_model.h
@ -133,12 +133,13 @@ typedef struct tagAllKernelTaskInfo {
    uint16_t argsCount;
    uint16_t argsSize;
    uint16_t reserved;
-    const void *dev_func;
+    void *devfunc;
    void *handle;
    uint8_t *smDesc;
    uint8_t *args;
    uint16_t *argsOffset;
 } rtAllKernelTaskInfo_t;
+
 typedef struct tagKernelTaskInfoEx {
    uint32_t flags;
    uint32_t argsSize;
@ -263,7 +264,7 @@ typedef struct tagTaskInfo {
    union {
        rtKernelTaskInfoEx_t kernelTaskEx;
        rtKernelTaskInfo_t kernelTask;
-        rtAllKernelTaskInfo_t allkernelTask;
+        rtAllKernelTaskInfo_t allKernelTask;
        rtEventTaskInfo_t eventTask;
        rtStreamSwitchTaskInfo_t streamSwitchTask;
        rtStreamActiveTaskInfo_t streamActiveTask;
@ -285,10 +286,27 @@ typedef struct tagTaskInfo {
    } u;
 } rtTaskInfo_t;

+typedef struct tagNodeInfo_t {
+    uint32_t nodeIdx;
+    uint32_t reserved[1];
+} rtNodeInfo;
+
+typedef struct tagHwtsInfo_t {
+    uint16_t taskId;
+    uint16_t sqExeHead;
+    uint16_t streamExeHead;
+    uint16_t reserved[2];
+} rtHwtsInfo;
+
 typedef struct tagLabelDevInfo_t {
    uint16_t modelId;
    uint16_t streamId;
    uint16_t labelId;
+    union {
+        rtNodeInfo nodeInfo;
+        rtHwtsInfo hwtsInfo;
+        uint16_t reserved[5];
+    }u;
 }rtLabelDevInfo;

 typedef rtError_t (*rtTaskGenCallback)(rtModel_t model, rtTaskInfo_t *taskInfo);
--- a/third_party/fwkacllib/inc/runtime/stream.h
+++ b/third_party/fwkacllib/inc/runtime/stream.h
@ -189,6 +189,28 @@ RTS_API rtError_t rtStreamActive(rtStream_t activeStream, rtStream_t stream);
 */
 RTS_API rtError_t rtStreamSwitchN(void *ptr, uint32_t size, void *valuePtr, rtStream_t *trueStreamPtr,
                                  uint32_t elementSize, rtStream_t stream, rtSwitchDataType_t dataType);
+
+/**
+ * @ingroup dvrt_stream
+ * @brief enable debug for dump overflow exception with stream
+ * @param [in] stream: stream handle
+ * @param [in] flag: debug flag
+ * @param [in] addr: ddr address of kernel exception dumped
+ * @param [out] streamId: presumably receives the stream id (direction assumed from the non-const pointer; TODO confirm)
+ * @param [out] taskId: presumably receives the task id (direction assumed from the non-const pointer; TODO confirm)
+ * @return RT_ERROR_NONE for ok
+ * @return RT_ERROR_INVALID_VALUE for error input
+ */
+RTS_API rtError_t rtDebugRegisterForStream(rtStream_t stream, uint32_t flag, const void *addr,
+                                   uint32_t *streamId, uint32_t *taskId);
+
+/**
+ * @ingroup dvrt_stream
+ * @brief disable debug for dump overflow exception with stream
+ * @param [in] stream: stream handle
+ * @return RT_ERROR_NONE for ok
+ * @return RT_ERROR_INVALID_VALUE for error input
+ */
+RTS_API rtError_t rtDebugUnRegisterForStream(rtStream_t stream);
+
 #if defined(__cplusplus) && !defined(COMPILE_OMG_PACKAGE)
 }
 #endif
--- a/third_party/fwkacllib/inc/toolchain/tuning_tool/tune_api.h
+++ b/third_party/fwkacllib/inc/toolchain/tuning_tool/tune_api.h
@ -11,93 +11,11 @@
 /** @defgroup aoe aoe调优接口 */
 #ifndef TUNE_API_H
 #define TUNE_API_H
-#include <vector>
 #include <map>
 #include <string>
-#include "graph/graph.h"
 #include "ge/ge_api.h"
 #include "aoe_types.h"

-/**
- * @ingroup aoe
- *
- * aoe status
- */
-enum MsTuneStatus {
-    MSTUNE_SUCCESS,  /** tune success */
-    MSTUNE_FAILED,   /** tune failed */
-};
-
-// Option key: for train options sets
-const std::string MSTUNE_SELF_KEY = "mstune";
-const std::string MSTUNE_GEINIT_KEY = "initialize";
-const std::string MSTUNE_GESESS_KEY = "session";
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct RunnerInitConfig {
-    // onilne online
-    std::string profPath;
-    std::string parserPath;
-    // ncs only
-    std::vector<uint32_t> devList;
-};
-
-struct RunnerOpInfo {
-    std::string opName;
-    uint64_t opCostTime;
-    uint64_t aicoreCostTime;
-    // gradient_split only
-    std::string modelName;
-    std::string opType;
-    std::vector<uint64_t> start;
-    std::vector<uint64_t> end;
-};
-
-struct RunnerModelInfo {
-    uint64_t totalCostTime;
-};
-
-struct RunnerRunResult {
-    std::vector<RunnerModelInfo> modelInfo;
-    std::vector<RunnerOpInfo> opInfo;
-};
-
-struct RunnerResult {
-    uint64_t totalCostTime;
-    std::map<std::string, uint64_t> opCostTime;
-    std::map<std::string, uint64_t> aicoreCostTime;
-};
-
-struct RunnerDataBuf {
-    void *ptr = nullptr;
-    size_t size = 0;
-};
-
-struct AOEBufferData {
-    std::shared_ptr<uint8_t> data = nullptr;
-    uint64_t length;
-};
-
-struct RunnerConfig {
-    bool isProf;
-    uint32_t loop;
-    // offline only
-    std::vector<RunnerDataBuf> input;
-    std::vector<RunnerDataBuf> output;
-    std::string modelPath;
-    RunnerDataBuf modelData;
-    // online only
-    uint32_t devId;
-    std::vector<std::vector<ge::Tensor>> inputs;
-    std::vector<ge::Graph> dependGraph; // run graph (for training)
-};
-#ifdef __cplusplus
-}
-#endif
-
 /**
 * @ingroup aoe
 * @par 描述: 命令行调优