diff --git a/mindspore/nn/loss/loss.py b/mindspore/nn/loss/loss.py
index c1ccbccc0d..f76743febf 100644
--- a/mindspore/nn/loss/loss.py
+++ b/mindspore/nn/loss/loss.py
@@ -192,10 +192,6 @@ class RMSELoss(_Loss):
     Outputs:
         Tensor, weighted loss float tensor.
 
-    Raises:
-        ValueError: If `reduction` is not one of 'none', 'mean', 'sum'.
-        ValueError: If the dimensions are different.
-
     Supported Platforms:
         ``Ascend`` ``GPU``
 
@@ -212,7 +208,6 @@ class RMSELoss(_Loss):
         self.MSELoss = MSELoss()
 
     def construct(self, logits, label):
-        _check_shape(logits.shape, label.shape)
         rmse_loss = F.sqrt(self.MSELoss(logits, label))
         return rmse_loss
 
@@ -482,16 +477,17 @@ class MultiClassDiceLoss(_Loss):
     obtained through the binary loss of each category, and then the average value.
 
     Args:
-        weights (Union[Tensor, None]): Tensor of shape `[num_classes, dim]`.
+        weights (Union[Tensor, None]): Tensor of shape `[num_classes, dim]`. The weight shape[0] should be equal to
+            y shape[1].
         ignore_indiex (Union[int, None]): Class index to ignore.
         activation (Union[str, Cell]): Activate function applied to the output of the fully connected layer, eg. 'ReLU'.
            Default: 'softmax'. Choose from: ['softmax', 'logsoftmax', 'relu', 'relu6', 'tanh','Sigmoid']
 
     Inputs:
         - **y_pred** (Tensor) - Tensor of shape (N, C, ...). y_pred dimension should be greater than 1.
-            The data type must be float16 or float32.
+          The data type must be float16 or float32.
         - **y** (Tensor) - Tensor of shape (N, C, ...). y dimension should be greater than 1.
-            The data type must be float16 or float32.
+          The data type must be float16 or float32.
 
     Outputs:
         Tensor, a tensor of shape with the per-example sampled MultiClass Dice Losses.
 
@@ -533,6 +529,7 @@ class MultiClassDiceLoss(_Loss):
         self.reshape = P.Reshape()
 
     def construct(self, logits, label):
+        _check_shape(logits.shape, label.shape)
         _check_ndim_multi(logits.ndim, label.ndim)
         total_loss = 0
 
diff --git a/mindspore/nn/metrics/bleu_score.py b/mindspore/nn/metrics/bleu_score.py
index 5a4a73474f..7b76c12b39 100644
--- a/mindspore/nn/metrics/bleu_score.py
+++ b/mindspore/nn/metrics/bleu_score.py
@@ -83,7 +83,7 @@ class BleuScore(Metric):
         Updates the internal evaluation result with `candidate_corpus` and `reference_corpus`.
 
         Args:
-            inputs: Input `candidate_corpus` and ``reference_corpus`. `candidate_corpus` and `reference_corpus` are a
+            inputs: Input `candidate_corpus` and `reference_corpus`. `candidate_corpus` and `reference_corpus` are a
                 list. The `candidate_corpus` is an iterable of machine translated corpus. The `reference_corpus` is
                 an iterable of iterables of reference corpus.
 
diff --git a/mindspore/nn/metrics/cosine_similarity.py b/mindspore/nn/metrics/cosine_similarity.py
index 4e3c911cd8..717f1f87fa 100644
--- a/mindspore/nn/metrics/cosine_similarity.py
+++ b/mindspore/nn/metrics/cosine_similarity.py
@@ -57,14 +57,14 @@ class CosineSimilarity(Metric):
         self.sqr_mtx_res = 0
         self._is_update = False
 
-    def update(self, *inputs):
+    def update(self, inputs):
         """
         Updates the internal evaluation result with 'input1'.
 
         Args:
             inputs: input_data `input1`. The input_data is a `Tensor` or an array.
         """
-        input_data = self._convert_data(inputs[0])
+        input_data = self._convert_data(inputs)
 
         if self.similarity == 'cosine':
             data = np.linalg.norm(input_data, ord=2, axis=1)
diff --git a/mindspore/nn/metrics/mean_surface_distance.py b/mindspore/nn/metrics/mean_surface_distance.py
index 05fcab9485..207de3321a 100644
--- a/mindspore/nn/metrics/mean_surface_distance.py
+++ b/mindspore/nn/metrics/mean_surface_distance.py
@@ -99,6 +99,9 @@ class MeanSurfaceDistance(Metric):
         y = self._convert_data(inputs[1])
         label_idx = inputs[2]
 
+        if not isinstance(label_idx, int):
+            raise TypeError("The data type of label_idx must be int, but got {}.".format(type(label_idx)))
+
         if y_pred.size == 0 or y_pred.shape != y.shape:
             raise ValueError("y_pred and y should have same shape, but got {}, {}.".format(y_pred.shape, y.shape))
 
diff --git a/mindspore/nn/metrics/roc.py b/mindspore/nn/metrics/roc.py
index 2b871faf14..c57ecec8e6 100644
--- a/mindspore/nn/metrics/roc.py
+++ b/mindspore/nn/metrics/roc.py
@@ -25,10 +25,10 @@ class ROC(Metric):
 
     Args:
         class_num (int): Integer with the number of classes. For the problem of binary classification, it is not
-                necessary to provide this argument. Default: None.
+            necessary to provide this argument. Default: None.
         pos_label (int): Determine the integer of positive class. Default: None. For binary problems, it is translated
-                to 1. For multiclass problems, this argument should not be set, as it is iteratively changed
-                in the range [0,num_classes-1]. Default: None.
+            to 1. For multiclass problems, this argument should not be set, as it is iteratively changed in the
+            range [0,num_classes-1]. Default: None.
 
     Examples:
         >>> 1) binary classification example
@@ -163,10 +163,10 @@ class ROC(Metric):
             A tuple, composed of `fpr`, `tpr`, and `thresholds`.
 
             - **fpr** (np.array) - np.array with false positive rates. If multiclass, this is a list of such np.array,
-                one for each class.
+              one for each class.
             - **tps** (np.array) - np.array with true positive rates. If multiclass, this is a list of such np.array,
-                one for each class.
-            - **thresholds** (np.array) - thresholds used for computing false- and true postive rates.
+              one for each class.
+            - **thresholds** (np.array) - thresholds used for computing false- and true positive rates.
         """
         if self._is_update is False:
             raise RuntimeError('Call the update method before calling eval.')
diff --git a/mindspore/nn/metrics/root_mean_square_surface_distance.py b/mindspore/nn/metrics/root_mean_square_surface_distance.py
index 14069032a5..9461765303 100644
--- a/mindspore/nn/metrics/root_mean_square_surface_distance.py
+++ b/mindspore/nn/metrics/root_mean_square_surface_distance.py
@@ -101,6 +101,9 @@ class RootMeanSquareDistance(Metric):
         y = self._convert_data(inputs[1])
         label_idx = inputs[2]
 
+        if not isinstance(label_idx, int):
+            raise TypeError("The data type of label_idx must be int, but got {}.".format(type(label_idx)))
+
         if y_pred.size == 0 or y_pred.shape != y.shape:
             raise ValueError("y_pred and y should have same shape, but got {}, {}.".format(y_pred.shape, y.shape))
 
diff --git a/mindspore/nn/optim/ada_grad.py b/mindspore/nn/optim/ada_grad.py
index cfea86d260..3119d38e23 100644
--- a/mindspore/nn/optim/ada_grad.py
+++ b/mindspore/nn/optim/ada_grad.py
@@ -68,8 +68,7 @@ class Adagrad(Optimizer):
           in the value of 'order_params' must be in one of group parameters.
 
         - grad_centralization: Optional. If "grad_centralization" is in the keys, the set value will be used.
-          If not, the `grad_centralization` in the base class will be used. This parameter only works on the
-          convolution layer.
+          If not, the `grad_centralization` is False by default. This parameter only works on the convolution layer.
         accum (float): The starting value for accumulators, must be zero or positive values. Default: 0.1.
         learning_rate (Union[float, Tensor, Iterable, LearningRateSchedule]): A value or a graph for the learning rate.
             When the learning_rate is an Iterable or a Tensor in a 1D dimension, use the dynamic learning rate, then
diff --git a/mindspore/nn/optim/adam.py b/mindspore/nn/optim/adam.py
index 2c126ec7be..41094ec0e9 100755
--- a/mindspore/nn/optim/adam.py
+++ b/mindspore/nn/optim/adam.py
@@ -236,8 +236,7 @@ class Adam(Optimizer):
          which in the 'order_params' must be in one of group parameters.
 
        - grad_centralization: Optional. If "grad_centralization" is in the keys, the set value will be used.
-          If not, the `grad_centralization` in the base class will be used. This parameter only works on the
-          convolution layer.
+          If not, the `grad_centralization` is False by default. This parameter only works on the convolution layer.
 
        learning_rate (Union[float, Tensor, Iterable, LearningRateSchedule]): A value or a graph for the learning rate.
            When the learning_rate is an Iterable or a Tensor in a 1D dimension, use the dynamic learning rate, then
diff --git a/mindspore/nn/optim/ftrl.py b/mindspore/nn/optim/ftrl.py
index 8ea4789cf3..7ff85985af 100644
--- a/mindspore/nn/optim/ftrl.py
+++ b/mindspore/nn/optim/ftrl.py
@@ -129,8 +129,7 @@ class FTRL(Optimizer):
          in the value of 'order_params' must be in one of group parameters.
 
        - grad_centralization: Optional. If "grad_centralization" is in the keys, the set value will be used.
-          If not, the `grad_centralization` in the base class will be used.This parameter only works on the
-          convolution layer.
+          If not, the `grad_centralization` is False by default. This parameter only works on the convolution layer.
 
    initial_accum (float): The starting value for accumulators, must be zero or positive values. Default: 0.1.
    learning_rate (float): The learning rate value, must be zero or positive, dynamic learning rate is currently
diff --git a/mindspore/nn/optim/lamb.py b/mindspore/nn/optim/lamb.py
index b7be68532b..12fa325af5 100755
--- a/mindspore/nn/optim/lamb.py
+++ b/mindspore/nn/optim/lamb.py
@@ -201,8 +201,7 @@ class Lamb(Optimizer):
          in the value of 'order_params' must be in one of group parameters.
 
        - grad_centralization: Optional. If "grad_centralization" is in the keys, the set value will be used.
-          If not, the `grad_centralization` in the base class will be used. This parameter only works on the
-          convolution layer.
+          If not, the `grad_centralization` is False by default. This parameter only works on the convolution layer.
 
    learning_rate (Union[float, Tensor, Iterable, LearningRateSchedule]): A value or a graph for the learning rate.
        When the learning_rate is an Iterable or a Tensor in a 1D dimension, use dynamic learning rate, then
diff --git a/mindspore/nn/optim/lazyadam.py b/mindspore/nn/optim/lazyadam.py
index bd49da6165..98562be94a 100644
--- a/mindspore/nn/optim/lazyadam.py
+++ b/mindspore/nn/optim/lazyadam.py
@@ -155,8 +155,7 @@ class LazyAdam(Optimizer):
          in the value of 'order_params' must be in one of group parameters.
 
        - grad_centralization: Optional. If "grad_centralization" is in the keys, the set value will be used.
-          If not, the `grad_centralization` in the base class will be used. This parameter only works on the
-          convolution layer.
+          If not, the `grad_centralization` is False by default. This parameter only works on the convolution layer.
 
    learning_rate (Union[float, Tensor, Iterable, LearningRateSchedule]): A value or a graph for the learning rate.
        When the learning_rate is an Iterable or a Tensor in a 1D dimension, use dynamic learning rate, then
diff --git a/mindspore/nn/optim/momentum.py b/mindspore/nn/optim/momentum.py
index 83bd19dcea..5078e87f72 100755
--- a/mindspore/nn/optim/momentum.py
+++ b/mindspore/nn/optim/momentum.py
@@ -84,8 +84,7 @@ class Momentum(Optimizer):
          in the value of 'order_params' must be in one of group parameters.
 
        - grad_centralization: Optional. If "grad_centralization" is in the keys, the set value will be used.
-          If not, the `grad_centralization` in the base class will be used. This parameter only works on the
-          convolution layer.
+          If not, the `grad_centralization` is False by default. This parameter only works on the convolution layer.
 
    learning_rate (Union[float, Tensor, Iterable, LearningRateSchedule]): A value or a graph for the learning rate.
        When the learning_rate is an Iterable or a Tensor in a 1D dimension, use dynamic learning rate, then
diff --git a/mindspore/nn/optim/optimizer.py b/mindspore/nn/optim/optimizer.py
index 9235638320..c008e35e9f 100755
--- a/mindspore/nn/optim/optimizer.py
+++ b/mindspore/nn/optim/optimizer.py
@@ -50,9 +50,9 @@ class Optimizer(Cell):
    weight_decay is positive. For most optimizer, when not separating parameters, the `weight_decay` in the API will
    be applied on the parameters without 'beta' or 'gamma' in their names if `weight_decay` is positive.
 
-   When separating parameter groups, if you want to centralize the gradient, set a to True, but the gradient
-   centralization can only be applied to the parameters of the convolution layer. If the parameters of the non
-   convolution layer are set to True, an error will be reported. Default: False.
+   When separating parameter groups, if you want to centralize the gradient, set grad_centralization to True,
+   but the gradient centralization can only be applied to the parameters of the convolution layer.
+   If the parameters of the non convolution layer are set to True, an error will be reported. Default: False.
 
    To improve parameter groups performance, the customized order of parameters can be supported.
 
@@ -81,7 +81,7 @@ class Optimizer(Cell):
          in the value of 'order_params' must be in one of group parameters.
 
        - grad_centralization: Optional. If "grad_centralization" is in the keys, the set value will be used.
-          If not, the `grad_centralization` in the base class will be used.
+          If not, the `grad_centralization` is False by default.
 
    weight_decay (float): A floating point value for the weight decay. It must be equal to or greater than 0.
        If the type of `weight_decay` input is int, it will be converted to float. Default: 0.0.
diff --git a/mindspore/nn/optim/proximal_ada_grad.py b/mindspore/nn/optim/proximal_ada_grad.py
index bf8789c1a4..6c0e6e948c 100644
--- a/mindspore/nn/optim/proximal_ada_grad.py
+++ b/mindspore/nn/optim/proximal_ada_grad.py
@@ -86,8 +86,7 @@ class ProximalAdagrad(Optimizer):
          in the value of 'order_params' must be in one of group parameters.
 
        - grad_centralization: Optional. If "grad_centralization" is in the keys, the set value will be used.
-          If not, the `grad_centralization` in the base class will be used. This parameter only works on the
-          convolution layer.
+          If not, the `grad_centralization` is False by default. This parameter only works on the convolution layer.
 
    accum (float): The starting value for accumulators, must be zero or positive values. Default: 0.1.
    learning_rate (Union[float, Tensor, Iterable, LearningRateSchedule]): A value or a graph for the learning rate.
diff --git a/mindspore/nn/optim/rmsprop.py b/mindspore/nn/optim/rmsprop.py
index 726d299371..559960939c 100644
--- a/mindspore/nn/optim/rmsprop.py
+++ b/mindspore/nn/optim/rmsprop.py
@@ -106,8 +106,7 @@ class RMSProp(Optimizer):
          in the value of 'order_params' must be in one of group parameters.
 
        - grad_centralization: Optional. If "grad_centralization" is in the keys, the set value will be used.
-          If not, the `grad_centralization` in the base class will be used. This parameter only works on the
-          convolution layer.
+          If not, the `grad_centralization` is False by default. This parameter only works on the convolution layer.
 
    learning_rate (Union[float, Tensor, Iterable, LearningRateSchedule]): A value or a graph for the learning rate.
        When the learning_rate is an Iterable or a Tensor in a 1D dimension, use dynamic learning rate, then
diff --git a/mindspore/nn/optim/sgd.py b/mindspore/nn/optim/sgd.py
index e1fcd79651..fc215da931 100755
--- a/mindspore/nn/optim/sgd.py
+++ b/mindspore/nn/optim/sgd.py
@@ -81,8 +81,7 @@ class SGD(Optimizer):
          in the value of 'order_params' must be in one of group parameters.
 
        - grad_centralization: Optional. If "grad_centralization" is in the keys, the set value will be used.
-          If not, the `grad_centralization` in the base class will be used. This parameter only works on the
-          convolution layer.
+          If not, the `grad_centralization` is False by default. This parameter only works on the convolution layer.
 
    learning_rate (Union[float, Tensor, Iterable, LearningRateSchedule]): A value or a graph for the learning rate.
        When the learning_rate is an Iterable or a Tensor in a 1D dimension, use dynamic learning rate, then
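Note (illustrative only, not part of the patch): the `grad_centralization` wording repeated across the optimizer hunks above describes a per-group key, which falls back to False for any group that omits it and only takes effect on convolution-layer parameters. A minimal usage sketch follows; the toy network, its layer names, and the name-based "conv" filter are assumptions for illustration.

import mindspore.nn as nn


class ToyNet(nn.Cell):
    """Toy network with one convolution layer and one dense layer (assumes 32x32, 3-channel inputs)."""

    def __init__(self):
        super(ToyNet, self).__init__()
        self.conv = nn.Conv2d(3, 8, 3)
        self.flatten = nn.Flatten()
        self.fc = nn.Dense(8 * 32 * 32, 10)

    def construct(self, x):
        x = self.conv(x)
        x = self.flatten(x)
        return self.fc(x)


net = ToyNet()

# Enable gradient centralization only for the convolution parameters; the second
# group omits the key, so it keeps the documented default of False.
conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params()))
group_params = [{'params': conv_params, 'grad_centralization': True},
                {'params': no_conv_params}]

opt = nn.Momentum(group_params, learning_rate=0.1, momentum=0.9)

Relatedly, after this patch the surface-distance metrics require the third argument of update() to be a Python int; any other type now raises the TypeError added above.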