Remove FC in dygraph, modify FC to Linear in sample code (#22082)

* modify fc to linear in sample code, test=develop
* remove FC, test=develop
* remove warnings, test=develop
* drop fluid/imperative/README.md, test=develop
* change fc to linear, test=develop
* polish code style, test=develop
5 years ago · cf475f95df
parent 64a4044292
commit cf475f95df
19 changed files with 355 additions and 668 deletions
--- a/paddle/fluid/pybind/imperative.cc
+++ b/paddle/fluid/pybind/imperative.cc
@@ -340,14 +340,14 @@ void BindImperative(py::module *m_ptr) {
                import paddle.fluid as fluid
                from paddle.fluid.dygraph.base import to_variable
-                from paddle.fluid.dygraph import FC
+                from paddle.fluid.dygraph import Linear
                import numpy as np
                data = np.random.uniform(-1, 1, [30, 10, 32]).astype('float32')
                with fluid.dygraph.guard():
-                    fc = FC("fc", 64, num_flatten_dims=2)
+                    linear = Linear(32, 64)
                    data = to_variable(data)
-                    x = fc(data)
+                    x = linear(data)
                    print(x.numpy())
       )DOC")
@@ -374,14 +374,14 @@ void BindImperative(py::module *m_ptr) {
                import paddle.fluid as fluid
                from paddle.fluid.dygraph.base import to_variable
-                from paddle.fluid.dygraph import FC
+                from paddle.fluid.dygraph import Linear
                import numpy as np
                data = np.random.uniform(-1, 1, [30, 10, 32]).astype('float32')
                with fluid.dygraph.guard():
-                    fc = FC("fc", 64, num_flatten_dims=2)
+                    linear = Linear(32, 64)
                    data = to_variable(data)
-                    x = fc(data)
+                    x = linear(data)
                    y = x.detach()
       )DOC")
--- a/python/paddle/fluid/dygraph/base.py
+++ b/python/paddle/fluid/dygraph/base.py
@@ -84,12 +84,12 @@ def _no_grad_(func):
        @fluid.dygraph.no_grad
        def test_layer():
            with fluid.dygraph.guard():
-                inp = np.ones([3, 32, 32], dtype='float32')
+                inp = np.ones([3, 1024], dtype='float32')
                t = fluid.dygraph.base.to_variable(inp)
-                fc1 = fluid.FC('fc1', size=4, bias_attr=False, num_flatten_dims=1)
+                linear1 = fluid.Linear(1024, 4, bias_attr=False)
-                fc2 = fluid.FC('fc2', size=4)
+                linear2 = fluid.Linear(4, 4)
-                ret = fc1(t)
+                ret = linear1(t)
-                dy_ret = fc2(ret)
+                dy_ret = linear2(ret)
        test_layer()
@@ -127,12 +127,12 @@ def guard(place=None):
        import paddle.fluid as fluid
        with fluid.dygraph.guard():
-            inp = np.ones([3, 32, 32], dtype='float32')
+            inp = np.ones([3, 1024], dtype='float32')
            t = fluid.dygraph.base.to_variable(inp)
-            fc1 = fluid.FC('fc1', size=4, bias_attr=False, num_flatten_dims=1)
+            linear1 = fluid.Linear(1024, 4, bias_attr=False)
-            fc2 = fluid.FC('fc2', size=4)
+            linear2 = fluid.Linear(4, 4)
-            ret = fc1(t)
+            ret = linear1(t)
-            dy_ret = fc2(ret)
+            dy_ret = linear2(ret)
    """
    train = framework.Program()
--- a/python/paddle/fluid/dygraph/nn.py
+++ b/python/paddle/fluid/dygraph/nn.py
@@ -29,10 +29,9 @@ import numbers
 import logging
 __all__ = [
-    'Conv2D', 'Conv3D', 'Pool2D', 'FC', 'Linear', 'BatchNorm', 'Embedding',
+    'Conv2D', 'Conv3D', 'Pool2D', 'Linear', 'BatchNorm', 'Embedding', 'GRUUnit',
-    'GRUUnit', 'LayerNorm', 'NCE', 'PRelu', 'BilinearTensorProduct',
+    'LayerNorm', 'NCE', 'PRelu', 'BilinearTensorProduct', 'Conv2DTranspose',
-    'Conv2DTranspose', 'Conv3DTranspose', 'GroupNorm', 'SpectralNorm',
+    'Conv3DTranspose', 'GroupNorm', 'SpectralNorm', 'TreeConv'
    'TreeConv'
 ]
@@ -865,7 +864,7 @@ class Linear(layers.Layer):
    where :math:`X` is the input Tensor, :math:`W` and :math:`b` are weight and bias respectively.
-    Different from FC layer, Linear layer takes only one ``Tensor`` input.
+    Linear layer takes only one ``Tensor`` input.
    The Linear layer multiplies input tensor with weight matrix and
    produces an output Tensor of shape [N, *, `output_dim`],
    where N is batch size and `*` means any number of additional dimensions.
@@ -959,221 +958,6 @@ class Linear(layers.Layer):
        return self._helper.append_activation(pre_activation, act=self._act)
 class FC(layers.Layer):
    """
    This interface is used to construct a callable object of the ``FC`` class.
    For more details, refer to code examples.
    It creates a fully connected layer in the network. It can take
    one or multiple ``Tensor`` as its inputs. It creates a Variable called weights for each input tensor,
    which represents a fully connected weight matrix from each input unit to
    each output unit. The fully connected layer multiplies each input tensor
    with its corresponding weight to produce an output Tensor with shape [N, `size`],
    where N is batch size. If multiple input tensors are given, the results of
    multiple output tensors with shape [N, `size`] will be summed up. If ``bias_attr``
    is not None, a bias variable will be created and added to the output.
    Finally, if ``act`` is not None, it will be applied to the output as well.
    When the input is single ``Tensor`` :
    .. math::
        Out = Act({XW + b})
    When the input are multiple ``Tensor`` :
    .. math::
        Out = Act({\sum_{i=0}^{N-1}X_iW_i + b})
    In the above equation:
    * :math:`N`: Number of the input. N equals to len(input) if input is list of ``Tensor`` .
    * :math:`X_i`: The i-th input ``Tensor`` .
    * :math:`W_i`: The i-th weights matrix corresponding i-th input tensor.
    * :math:`b`: The bias parameter created by this layer (if needed).
    * :math:`Act`: The activation function.
    * :math:`Out`: The output ``Tensor`` .
    See below for an example.
    .. code-block:: text
        Given:
            data_1.data = [[[0.1, 0.2]]]
            data_1.shape = (1, 1, 2) # 1 is batch_size
            data_2.data = [[[0.1, 0.2, 0.3]]]
            data_2.shape = (1, 1, 3) # 1 is batch_size
            fc = FC("fc", 2, num_flatten_dims=2)
            out = fc(input=[data_1, data_2])
        Then:
            out.data = [[[0.182996 -0.474117]]]
            out.shape = (1, 1, 2)
    Parameters:
        name_scope(str): The name of this class.
        size(int): The number of output units in this layer.
        num_flatten_dims (int, optional): The fc layer can accept an input tensor with more than
            two dimensions. If this happens, the multi-dimension tensor will first be flattened
            into a 2-dimensional matrix. The parameter `num_flatten_dims` determines how the input
            tensor is flattened: the first `num_flatten_dims` (inclusive, index starts from 1)
            dimensions will be flatten to form the first dimension of the final matrix (height of
            the matrix), and the rest `rank(X) - num_flatten_dims` dimensions are flattened to
            form the second dimension of the final matrix (width of the matrix). For example, suppose
            `X` is a 5-dimensional tensor with a shape [2, 3, 4, 5, 6], and `num_flatten_dims` = 3.
            Then, the flattened matrix will have a shape [2 x 3 x 4, 5 x 6] = [24, 30]. Default: 1
        param_attr (ParamAttr or list of ParamAttr, optional): The parameter attribute for learnable
            weights(Parameter) of this layer. Default: None.
        bias_attr (ParamAttr or list of ParamAttr, optional): The attribute for the bias
            of this layer. If it is set to False, no bias will be added to the output units.
            If it is set to None, the bias is initialized zero. Default: None.
        act (str, optional): Activation to be applied to the output of this layer. Default: None.
        is_test(bool, optional): A flag indicating whether execution is in test phase. Default: False.
        dtype(str, optional): Dtype used for weight, it can be "float32" or "float64". Default: "float32".
    Attribute:
        **weight** (list of Parameter): the learnable weights of this layer.
        **bias** (Parameter or None): the learnable bias of this layer.
    Returns:
        None
    Examples:
        .. code-block:: python
          from paddle.fluid.dygraph.base import to_variable
          import paddle.fluid as fluid
          from paddle.fluid.dygraph import FC
          import numpy as np
          data = np.random.uniform(-1, 1, [30, 10, 32]).astype('float32')
          with fluid.dygraph.guard():
              fc = FC("fc", 64, num_flatten_dims=2)
              data = to_variable(data)
              conv = fc(data)
    """
    def __init__(self,
                 name_scope,
                 size,
                 num_flatten_dims=1,
                 param_attr=None,
                 bias_attr=None,
                 act=None,
                 is_test=False,
                 dtype="float32"):
        super(FC, self).__init__(name_scope, dtype)
        self._size = size
        self._num_flatten_dims = num_flatten_dims
        self._dtype = dtype
        self._param_attr = param_attr
        self._bias_attr = bias_attr
        self._act = act
        self.__w = list()
    def _build_once(self, input):
        i = 0
        for inp, param in self._helper.iter_inputs_and_params(input,
                                                              self._param_attr):
            input_shape = inp.shape
            param_shape = [
                reduce(lambda a, b: a * b, input_shape[self._num_flatten_dims:],
                       1)
            ] + [self._size]
            self.__w.append(
                self.add_parameter(
                    '_w%d' % i,
                    self.create_parameter(
                        attr=param,
                        shape=param_shape,
                        dtype=self._dtype,
                        is_bias=False)))
            i += 1
        size = list([self._size])
        self._b = self.create_parameter(
            attr=self._bias_attr, shape=size, dtype=self._dtype, is_bias=True)
    # TODO(songyouwei): We should remove _w property
    @property
    def _w(self, i=0):
        return self.__w[i]
    @_w.setter
    def _w(self, value, i=0):
        assert isinstance(self.__w[i], Variable)
        self.__w[i].set_value(value)
    @property
    def weight(self):
        if len(self.__w) > 1:
            return self.__w
        else:
            return self.__w[0]
    @weight.setter
    def weight(self, value):
        if len(self.__w) == 1:
            self.__w[0] = value
    @property
    def bias(self):
        return self._b
    @bias.setter
    def bias(self, value):
        self._b = value
    def forward(self, input):
        mul_results = list()
        i = 0
        for inp, param in self._helper.iter_inputs_and_params(input,
                                                              self._param_attr):
            tmp = self._helper.create_variable_for_type_inference(self._dtype)
            self._helper.append_op(
                type="mul",
                inputs={"X": inp,
                        "Y": self.__w[i]},
                outputs={"Out": tmp},
                attrs={
                    "x_num_col_dims": self._num_flatten_dims,
                    "y_num_col_dims": 1
                })
            i += 1
            mul_results.append(tmp)
        if len(mul_results) == 1:
            pre_bias = mul_results[0]
        else:
            pre_bias = self._helper.create_variable_for_type_inference(
                self._dtype)
            self._helper.append_op(
                type="sum",
                inputs={"X": mul_results},
                outputs={"Out": pre_bias},
                attrs={"use_mkldnn": False})
        if self._b:
            pre_activation = self._helper.create_variable_for_type_inference(
                dtype=self._dtype)
            self._helper.append_op(
                type='elementwise_add',
                inputs={'X': [pre_bias],
                        'Y': [self._b]},
                outputs={'Out': [pre_activation]},
                attrs={'axis': self._num_flatten_dims})
        else:
            pre_activation = pre_bias
        # Currently, we don't support inplace in dygraph mode
        return self._helper.append_activation(pre_activation, act=self._act)
 class BatchNorm(layers.Layer):
    """
    This interface is used to construct a callable object of the ``BatchNorm`` class.
--- a/python/paddle/fluid/dygraph/parallel.py
+++ b/python/paddle/fluid/dygraph/parallel.py
@@ -97,7 +97,7 @@ class DataParallel(layers.Layer):
           import paddle.fluid as fluid
           import paddle.fluid.dygraph as dygraph
           from paddle.fluid.optimizer import AdamOptimizer
-           from paddle.fluid.dygraph.nn import FC
+           from paddle.fluid.dygraph.nn import Linear
           from paddle.fluid.dygraph.base import to_variable
           place = fluid.CUDAPlace(0)
@@ -106,28 +106,28 @@ class DataParallel(layers.Layer):
               # prepare the data parallel context
               strategy=dygraph.parallel.prepare_context()
-               fc_layer = FC("FC", 10, act="softmax")
+               linear = Linear(1, 10, act="softmax")
               adam = fluid.optimizer.AdamOptimizer()
               # make the module become the data parallelism module
-               fc_layer = dygraph.parallel.DataParallel(fc_layer, strategy)
+               linear = dygraph.parallel.DataParallel(linear, strategy)
               x_data = np.random.random(size=[10, 1]).astype(np.float32)
               data = to_variable(x_data)
-               hidden = fc_layer(data)
+               hidden = linear(data)
               avg_loss = fluid.layers.mean(hidden)
               # scale the loss according to the number of trainers.
-               avg_loss = fc_layer.scale_loss(avg_loss)
+               avg_loss = linear.scale_loss(avg_loss)
               avg_loss.backward()
               # collect the gradients of trainers.
-               fc_layer.apply_collective_grads()
+               linear.apply_collective_grads()
               adam.minimize(avg_loss)
-               fc_layer.clear_gradients()
+               linear.clear_gradients()
    Args:
        layers(Layer): The module that should be executed by data parallel.
--- a/python/paddle/fluid/dygraph/varbase_patch_methods.py
+++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py
@@ -39,17 +39,17 @@ def monkey_patch_varbase():
                import paddle.fluid as fluid
                from paddle.fluid.dygraph.base import to_variable
-                from paddle.fluid.dygraph import FC
+                from paddle.fluid.dygraph import Linear
                import numpy as np
-                data = np.ones([3, 32, 32], dtype='float32')
+                data = np.ones([3, 1024], dtype='float32')
                with fluid.dygraph.guard():
-                    fc = fluid.dygraph.FC("fc", 4)
+                    linear = fluid.dygraph.Linear(1024, 4)
                    t = to_variable(data)
-                    fc(t)  # call with default weight
+                    linear(t)  # call with default weight
                    custom_weight = np.random.randn(1024, 4).astype("float32")
-                    fc.weight.set_value(custom_weight)  # change existing weight
+                    linear.weight.set_value(custom_weight)  # change existing weight
-                    out = fc(t)  # call with different weight
+                    out = linear(t)  # call with different weight
        """
        assert isinstance(value, (np.ndarray, core.VarBase)), \
--- a/python/paddle/fluid/dygraph_grad_clip.py
+++ b/python/paddle/fluid/dygraph_grad_clip.py
@@ -65,7 +65,7 @@ class GradClipByValue(GradClipBase):
            import paddle.fluid as fluid
            from paddle.fluid.dygraph.base import to_variable
-            from paddle.fluid.dygraph.nn import FC
+            from paddle.fluid.dygraph.nn import Linear
            from paddle.fluid.clip import GradClipByValue, GradClipByNorm, GradClipByGlobalNorm
@@ -77,9 +77,9 @@ class GradClipByValue(GradClipBase):
                init_value = np.random.uniform( -1, 1, (10, 10)).astype('float32')
-                fc = FC( "fc", 10)
+                linear = Linear( 10, 10)
-                out = fc( to_variable(init_value) )
+                out = linear( to_variable(init_value) )
                loss = fluid.layers.reduce_mean( out )
@@ -144,7 +144,7 @@ class GradClipByNorm(GradClipBase):
            import paddle.fluid as fluid
            from paddle.fluid.dygraph.base import to_variable
-            from paddle.fluid.dygraph.nn import FC
+            from paddle.fluid.dygraph.nn import Linear
            from paddle.fluid.clip import GradClipByValue, GradClipByNorm, GradClipByGlobalNorm
@@ -156,9 +156,9 @@ class GradClipByNorm(GradClipBase):
                init_value = np.random.uniform( -1, 1, (10, 10)).astype('float32')
-                fc = FC( "fc", 10)
+                linear = Linear( 10, 10)
-                out = fc( to_variable(init_value) )
+                out = linear( to_variable(init_value) )
                loss = fluid.layers.reduce_mean( out )
@@ -222,7 +222,7 @@ class GradClipByGlobalNorm(GradClipBase):
            import paddle.fluid as fluid
            from paddle.fluid.dygraph.base import to_variable
-            from paddle.fluid.dygraph.nn import FC
+            from paddle.fluid.dygraph.nn import Linear
            from paddle.fluid.dygraph_grad_clip import GradClipByValue, GradClipByNorm, GradClipByGlobalNorm
@@ -234,9 +234,9 @@ class GradClipByGlobalNorm(GradClipBase):
                init_value = np.random.uniform( -1, 1, (10, 10)).astype('float32')
-                fc = FC( "fc", 10)
+                linear = Linear( 10, 10)
-                out = fc( to_variable(init_value) )
+                out = linear( to_variable(init_value) )
                loss = fluid.layers.reduce_mean( out )
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -959,14 +959,14 @@ class Variable(object):
                import paddle.fluid as fluid
                from paddle.fluid.dygraph.base import to_variable
-                from paddle.fluid.dygraph import FC
+                from paddle.fluid.dygraph import Linear
                import numpy as np
                data = np.random.uniform(-1, 1, [30, 10, 32]).astype('float32')
                with fluid.dygraph.guard():
-                    fc = FC("fc", 64, num_flatten_dims=2)
+                    linear = Linear(32, 64)
                    data = to_variable(data)
-                    x = fc(data)
+                    x = linear(data)
                    y = x.detach()
        """
@@ -991,14 +991,14 @@ class Variable(object):
                import paddle.fluid as fluid
                from paddle.fluid.dygraph.base import to_variable
-                from paddle.fluid.dygraph import FC
+                from paddle.fluid.dygraph import Linear
                import numpy as np
                data = np.random.uniform(-1, 1, [30, 10, 32]).astype('float32')
                with fluid.dygraph.guard():
-                    fc = FC("fc", 64, num_flatten_dims=2)
+                    linear = Linear(32, 64)
                    data = to_variable(data)
-                    x = fc(data)
+                    x = linear(data)
                    print(x.numpy())
        """
@@ -1020,17 +1020,17 @@ class Variable(object):
                import paddle.fluid as fluid
                from paddle.fluid.dygraph.base import to_variable
-                from paddle.fluid.dygraph import FC
+                from paddle.fluid.dygraph import Linear
                import numpy as np
-                data = np.ones([3, 32, 32], dtype='float32')
+                data = np.ones([3, 1024], dtype='float32')
                with fluid.dygraph.guard():
-                    fc = fluid.dygraph.FC("fc", 4)
+                    linear = fluid.dygraph.Linear(1024, 4)
                    t = to_variable(data)
-                    fc(t)  # call with default weight
+                    linear(t)  # call with default weight
                    custom_weight = np.random.randn(1024, 4).astype("float32")
-                    fc.weight.set_value(custom_weight)  # change existing weight
+                    linear.weight.set_value(custom_weight)  # change existing weight
-                    out = fc(t)  # call with different weight
+                    out = linear(t)  # call with different weight
        """
        pass
@@ -1223,18 +1223,18 @@ class Variable(object):
                value0 = np.arange(26).reshape(2, 13).astype("float32")
                value1 = np.arange(6).reshape(2, 3).astype("float32")
                value2 = np.arange(10).reshape(2, 5).astype("float32")
-                fc = fluid.FC("fc1", size=5, dtype="float32")
+                linear = fluid.Linear(13, 5, dtype="float32")
-                fc2 = fluid.FC("fc2", size=3, dtype="float32")
+                linear2 = fluid.Linear(3, 3, dtype="float32")
                a = fluid.dygraph.to_variable(value0)
                b = fluid.dygraph.to_variable(value1)
                c = fluid.dygraph.to_variable(value2)
-                out1 = fc(a)
+                out1 = linear(a)
-                out2 = fc2(b)
+                out2 = linear2(b)
                out1.stop_gradient = True
                out = fluid.layers.concat(input=[out1, out2, c], axis=1)
                out.backward()
-                assert (fc._w.gradient() == 0).all()
+                assert (linear.weight.gradient() == 0).all()
                assert (out1.gradient() == 0).all()
        """
        if in_dygraph_mode():
--- a/python/paddle/fluid/install_check.py
+++ b/python/paddle/fluid/install_check.py
@@ -30,14 +30,15 @@ __all__ = ['run_check']
 class SimpleLayer(Layer):
-    def __init__(self, name_scope):
+    def __init__(self, input_size):
-        super(SimpleLayer, self).__init__(name_scope)
+        super(SimpleLayer, self).__init__()
-        self._fc1 = nn.FC(self.full_name(),
+        self._linear1 = nn.Linear(
            input_size,
            3,
            param_attr=ParamAttr(initializer=Constant(value=0.1)))
    def forward(self, inputs):
-        x = self._fc1(inputs)
+        x = self._linear1(inputs)
        x = layers.reduce_sum(x)
        return x
@@ -79,7 +80,7 @@ def run_check():
                    build_strategy = compiler.BuildStrategy()
                    build_strategy.enable_inplace = True
                    inp = layers.data(name="inp", shape=[2, 2])
-                    simple_layer = SimpleLayer("simple_layer")
+                    simple_layer = SimpleLayer(input_size=2)
                    out = simple_layer(inp)
                    exe = executor.Executor(
                        core.CUDAPlace(0) if core.is_compiled_with_cuda() and
@@ -108,10 +109,11 @@ def run_check():
                with unique_name.guard():
                    inp0 = layers.data(
                        name="inp", shape=[2, 2], append_batch_size=False)
-                    simple_layer0 = SimpleLayer("simple_layer")
+                    simple_layer0 = SimpleLayer(input_size=2)
                    out0 = simple_layer0(inp0)
                    param_grads = backward.append_backward(
-                        out0, parameter_list=[simple_layer0._fc1._w.name])[0]
+                        out0,
                        parameter_list=[simple_layer0._linear1.weight.name])[0]
                    exe0 = executor.Executor(
                        core.CUDAPlace(0) if core.is_compiled_with_cuda() and
                        (core.get_cuda_device_count() > 0) else core.CPUPlace())
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -3002,7 +3002,7 @@ def layer_norm(input,
            print(output)
    """
    assert in_dygraph_mode(
-    ) is not True, "please use FC instead of fc in dygraph mode!"
+    ) is not True, "please use LayerNorm instead of layer_norm in dygraph mode!"
    helper = LayerHelper('layer_norm', **locals())
    dtype = helper.input_dtype()
--- a/python/paddle/fluid/tests/unittests/test_detach.py
+++ b/python/paddle/fluid/tests/unittests/test_detach.py
@@ -17,8 +17,7 @@ from __future__ import print_function
 import numpy as np
 import paddle.fluid as fluid
-from paddle.fluid import FC
+from paddle.fluid.dygraph import Linear
 from paddle.fluid.dygraph import FC
 from paddle.fluid.dygraph.base import to_variable
 import unittest
@@ -33,37 +32,37 @@ class Test_Detach(unittest.TestCase):
    def no_detach_multi(self):
        data = self.generate_Data()
        with fluid.dygraph.guard():
-            fc_w_param_attrs = fluid.ParamAttr(
+            linear_w_param_attrs = fluid.ParamAttr(
                initializer=fluid.initializer.Constant(5.0))
-            fc_b_param_attrs = fluid.ParamAttr(
+            linear_b_param_attrs = fluid.ParamAttr(
                initializer=fluid.initializer.Constant(6.0))
-            fc = FC("fc",
+            linear = Linear(
                4,
                10,
-                    num_flatten_dims=1,
+                param_attr=linear_w_param_attrs,
-                    param_attr=fc_w_param_attrs,
+                bias_attr=linear_b_param_attrs)
-                    bias_attr=fc_b_param_attrs)
+            linear1_w_param_attrs = fluid.ParamAttr(
            fc1_w_param_attrs = fluid.ParamAttr(
                initializer=fluid.initializer.Constant(7.0))
-            fc1_b_param_attrs = fluid.ParamAttr(
+            linear1_b_param_attrs = fluid.ParamAttr(
                initializer=fluid.initializer.Constant(8.0))
-            fc1 = FC("fc",
+            linear1 = Linear(
                10,
                1,
-                     num_flatten_dims=1,
+                param_attr=linear1_w_param_attrs,
-                     param_attr=fc1_w_param_attrs,
+                bias_attr=linear1_b_param_attrs)
-                     bias_attr=fc1_b_param_attrs)
+            linear2_w_param_attrs = fluid.ParamAttr(
            fc2_w_param_attrs = fluid.ParamAttr(
                initializer=fluid.initializer.Constant(9.0))
-            fc2_b_param_attrs = fluid.ParamAttr(
+            linear2_b_param_attrs = fluid.ParamAttr(
                initializer=fluid.initializer.Constant(10.0))
-            fc2 = FC("fc",
+            linear2 = Linear(
                10,
                1,
-                     num_flatten_dims=1,
+                param_attr=linear2_w_param_attrs,
-                     param_attr=fc2_w_param_attrs,
+                bias_attr=linear2_b_param_attrs)
                     bias_attr=fc2_b_param_attrs)
            data = to_variable(data)
-            x = fc(data)
+            x = linear(data)
-            x1 = fc1(x)
+            x1 = linear1(x)
-            x2 = fc2(x)
+            x2 = linear2(x)
            loss = x1 + x2
            # print(loss, loss.shape)
            loss.backward()
@@ -72,27 +71,27 @@ class Test_Detach(unittest.TestCase):
    def no_detach_single(self):
        data = self.generate_Data()
        with fluid.dygraph.guard():
-            fc_w_param_attrs = fluid.ParamAttr(
+            linear_w_param_attrs = fluid.ParamAttr(
                initializer=fluid.initializer.Constant(5.0))
-            fc_b_param_attrs = fluid.ParamAttr(
+            linear_b_param_attrs = fluid.ParamAttr(
                initializer=fluid.initializer.Constant(6.0))
-            fc = FC("fc",
+            linear = Linear(
                4,
                10,
-                    num_flatten_dims=1,
+                param_attr=linear_w_param_attrs,
-                    param_attr=fc_w_param_attrs,
+                bias_attr=linear_b_param_attrs)
-                    bias_attr=fc_b_param_attrs)
+            linear1_w_param_attrs = fluid.ParamAttr(
            fc1_w_param_attrs = fluid.ParamAttr(
                initializer=fluid.initializer.Constant(7.0))
-            fc1_b_param_attrs = fluid.ParamAttr(
+            linear1_b_param_attrs = fluid.ParamAttr(
                initializer=fluid.initializer.Constant(8.0))
-            fc1 = FC("fc",
+            linear1 = Linear(
                10,
                1,
-                     num_flatten_dims=1,
+                param_attr=linear1_w_param_attrs,
-                     param_attr=fc1_w_param_attrs,
+                bias_attr=linear1_b_param_attrs)
                     bias_attr=fc1_b_param_attrs)
            data = to_variable(data)
-            x = fc(data)
+            x = linear(data)
-            x1 = fc1(x)
+            x1 = linear1(x)
            loss = x1
            # print(loss, loss.shape)
            loss.backward()
@@ -101,38 +100,38 @@ class Test_Detach(unittest.TestCase):
    def detach_multi(self):
        data = self.generate_Data()
        with fluid.dygraph.guard():
-            fc_w_param_attrs = fluid.ParamAttr(
+            linear_w_param_attrs = fluid.ParamAttr(
                initializer=fluid.initializer.Constant(5.0))
-            fc_b_param_attrs = fluid.ParamAttr(
+            linear_b_param_attrs = fluid.ParamAttr(
                initializer=fluid.initializer.Constant(6.0))
-            fc = FC("fc",
+            linear = Linear(
                4,
                10,
-                    num_flatten_dims=1,
+                param_attr=linear_w_param_attrs,
-                    param_attr=fc_w_param_attrs,
+                bias_attr=linear_b_param_attrs)
-                    bias_attr=fc_b_param_attrs)
+            linear1_w_param_attrs = fluid.ParamAttr(
            fc1_w_param_attrs = fluid.ParamAttr(
                initializer=fluid.initializer.Constant(7.0))
-            fc1_b_param_attrs = fluid.ParamAttr(
+            linear1_b_param_attrs = fluid.ParamAttr(
                initializer=fluid.initializer.Constant(8.0))
-            fc1 = FC("fc",
+            linear1 = Linear(
                10,
                1,
-                     num_flatten_dims=1,
+                param_attr=linear1_w_param_attrs,
-                     param_attr=fc1_w_param_attrs,
+                bias_attr=linear1_b_param_attrs)
-                     bias_attr=fc1_b_param_attrs)
+            linear2_w_param_attrs = fluid.ParamAttr(
            fc2_w_param_attrs = fluid.ParamAttr(
                initializer=fluid.initializer.Constant(9.0))
-            fc2_b_param_attrs = fluid.ParamAttr(
+            linear2_b_param_attrs = fluid.ParamAttr(
                initializer=fluid.initializer.Constant(10.0))
-            fc2 = FC("fc",
+            linear2 = Linear(
                10,
                1,
-                     num_flatten_dims=1,
+                param_attr=linear2_w_param_attrs,
-                     param_attr=fc2_w_param_attrs,
+                bias_attr=linear2_b_param_attrs)
                     bias_attr=fc2_b_param_attrs)
            data = to_variable(data)
-            x = fc(data)
+            x = linear(data)
            x_detach = x.detach()
-            x1 = fc1(x)
+            x1 = linear1(x)
-            x2 = fc2(x_detach)
+            x2 = linear2(x_detach)
            loss = x1 + x2
            # print(loss, loss.shape)
            loss.backward()
--- a/python/paddle/fluid/tests/unittests/test_dygraph_mnist_fp16.py
+++ b/python/paddle/fluid/tests/unittests/test_dygraph_mnist_fp16.py
@@ -18,7 +18,7 @@ import unittest
 import numpy as np
 import paddle.fluid as fluid
-from paddle.fluid.dygraph.nn import Conv2D, Pool2D, FC
+from paddle.fluid.dygraph.nn import Conv2D, Pool2D, Linear
 class SimpleImgConvPool(fluid.dygraph.Layer):
@ -71,8 +71,8 @@ class SimpleImgConvPool(fluid.dygraph.Layer):
 class MNIST(fluid.dygraph.Layer):
-    def __init__(self, name_scope, dtype="float32"):
+    def __init__(self, dtype="float32"):
-        super(MNIST, self).__init__(name_scope)
+        super(MNIST, self).__init__()
        self._simple_img_conv_pool_1 = SimpleImgConvPool(
            num_channels=3,
@ -94,10 +94,11 @@ class MNIST(fluid.dygraph.Layer):
            dtype=dtype,
            use_cudnn=True)
-        pool_2_shape = 50 * 4 * 4
+        self.pool_2_shape = 50 * 53 * 53
        SIZE = 10
-        scale = (2.0 / (pool_2_shape**2 * SIZE))**0.5
+        scale = (2.0 / (self.pool_2_shape**2 * SIZE))**0.5
-        self._fc = FC(self.full_name(),
+        self._linear = Linear(
+            self.pool_2_shape,
            10,
            param_attr=fluid.param_attr.ParamAttr(
                initializer=fluid.initializer.NormalInitializer(
@ -108,7 +109,8 @@ class MNIST(fluid.dygraph.Layer):
    def forward(self, inputs, label):
        x = self._simple_img_conv_pool_1(inputs)
        x = self._simple_img_conv_pool_2(x)
-        cost = self._fc(x)
+        x = fluid.layers.reshape(x, shape=[-1, self.pool_2_shape])
+        cost = self._linear(x)
        loss = fluid.layers.cross_entropy(cost, label)
        avg_loss = fluid.layers.mean(loss)
        return avg_loss
@ -123,7 +125,7 @@ class TestMnist(unittest.TestCase):
        x = np.random.randn(1, 3, 224, 224).astype("float16")
        y = np.random.randn(1, 1).astype("int64")
        with fluid.dygraph.guard(fluid.CUDAPlace(0)):
-            model = MNIST("mnist", dtype="float16")
+            model = MNIST(dtype="float16")
            x = fluid.dygraph.to_variable(x)
            y = fluid.dygraph.to_variable(y)
            loss = model(x, y)
--- a/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py
--- a/python/paddle/fluid/tests/unittests/test_imperative_basic.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_basic.py
@ -18,7 +18,7 @@ import numpy as np
 import paddle.fluid as fluid
 from paddle.fluid import core
-from paddle.fluid import FC
+from paddle.fluid import Linear
 from test_imperative_base import new_program_scope
@ -35,15 +35,17 @@ class MyLayer(fluid.Layer):
 class MLP(fluid.Layer):
-    def __init__(self, name_scope):
+    def __init__(self, input_size):
-        super(MLP, self).__init__(name_scope)
+        super(MLP, self).__init__()
-        self._fc1 = FC(self.full_name(),
+        self._linear1 = Linear(
+            input_size,
            3,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Constant(value=0.1)),
            bias_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Constant(value=0.1)))
-        self._fc2 = FC(self.full_name(),
+        self._linear2 = Linear(
+            3,
            4,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Constant(value=0.1)),
@ -51,8 +53,8 @@ class MLP(fluid.Layer):
                initializer=fluid.initializer.Constant(value=0.1)))
    def forward(self, inputs):
-        x = self._fc1(inputs)
+        x = self._linear1(inputs)
-        x = self._fc2(x)
+        x = self._linear2(x)
        x = fluid.layers.reduce_sum(x)
        return x
@ -338,29 +340,29 @@ class TestImperative(unittest.TestCase):
        np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)
        with fluid.dygraph.guard():
            var_inp = fluid.dygraph.base.to_variable(np_inp)
-            mlp = MLP("mlp")
+            mlp = MLP(input_size=2)
            out = mlp(var_inp)
            dy_out = out.numpy()
            out.backward()
-            dy_grad = mlp._fc1.weight.gradient()
+            dy_grad = mlp._linear1.weight.gradient()
        with fluid.dygraph.guard():
            var_inp2 = fluid.dygraph.base.to_variable(np_inp)
-            mlp2 = MLP("mlp")
+            mlp2 = MLP(input_size=2)
            out2 = mlp2(var_inp2)
            dy_out2 = out2.numpy()
            backward_strategy = fluid.dygraph.BackwardStrategy()
            backward_strategy.sort_sum_gradient = True
            out2.backward(backward_strategy)
-            dy_grad2 = mlp2._fc1.weight.gradient()
+            dy_grad2 = mlp2._linear1.weight.gradient()
        with new_program_scope():
            inp = fluid.layers.data(
                name="inp", shape=[2, 2], append_batch_size=False)
-            mlp = MLP("mlp")
+            mlp = MLP(input_size=2)
            out = mlp(inp)
            param_grads = fluid.backward.append_backward(
-                out, parameter_list=[mlp._fc1.weight.name])[0]
+                out, parameter_list=[mlp._linear1.weight.name])[0]
            exe = fluid.Executor(fluid.CPUPlace(
            ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
            exe.run(fluid.default_startup_program())
@ -375,15 +377,15 @@ class TestImperative(unittest.TestCase):
        self.assertTrue(np.allclose(dy_grad2, static_grad))
        params = mlp.parameters(True)
-        self.assertEqual("mlp/MLP_0/FC_0.w_0", params[0].name)
+        self.assertEqual("linear_0.w_0", params[0].name)
-        self.assertEqual("mlp/MLP_0/FC_0.b_0", params[1].name)
+        self.assertEqual("linear_0.b_0", params[1].name)
-        self.assertEqual("mlp/MLP_0/FC_1.w_0", params[2].name)
+        self.assertEqual("linear_1.w_0", params[2].name)
-        self.assertEqual("mlp/MLP_0/FC_1.b_0", params[3].name)
+        self.assertEqual("linear_1.b_0", params[3].name)
        self.assertEqual(len(params), 4)
        sublayers = mlp.sublayers(True)
-        self.assertEqual(mlp._fc1, sublayers[0])
+        self.assertEqual(mlp._linear1, sublayers[0])
-        self.assertEqual(mlp._fc2, sublayers[1])
+        self.assertEqual(mlp._linear2, sublayers[1])
        self.assertEqual(len(sublayers), 2)
    def test_dygraph_vs_static(self):
--- a/python/paddle/fluid/tests/unittests/test_imperative_debug_string.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_debug_string.py
@ -20,17 +20,17 @@ import numpy as np
 class MLP(fluid.Layer):
-    def __init__(self, name_scope):
+    def __init__(self, input_size):
-        super(MLP, self).__init__(name_scope)
+        super(MLP, self).__init__()
-        self._fc1 = fluid.dygraph.FC(
+        self._linear1 = fluid.dygraph.Linear(
-            self.full_name(),
+            input_size,
            3,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Constant(value=0.1)),
            bias_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Constant(value=0.1)))
-        self._fc2 = fluid.dygraph.FC(
+        self._linear2 = fluid.dygraph.Linear(
-            self.full_name(),
+            3,
            4,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Constant(value=0.1)),
@ -38,8 +38,8 @@ class MLP(fluid.Layer):
                initializer=fluid.initializer.Constant(value=0.1)))
    def forward(self, inputs):
-        x = self._fc1(inputs)
+        x = self._linear1(inputs)
-        x = self._fc2(x)
+        x = self._linear2(x)
        x = fluid.layers.reduce_sum(x)
        return x
@ -51,7 +51,7 @@ class TestDygraphDebugString(unittest.TestCase):
        trace_var = 0
        alive_var = 0
        with fluid.dygraph.guard():
-            mlp = MLP("mlp")
+            mlp = MLP(input_size=2)
            for i in range(10):
                var_inp = fluid.dygraph.base.to_variable(np_inp)
                out = mlp(var_inp)
--- a/python/paddle/fluid/tests/unittests/test_imperative_framework.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_framework.py
@ -21,17 +21,17 @@ from test_imperative_base import new_program_scope
 class MLP(fluid.Layer):
-    def __init__(self, name_scope):
+    def __init__(self, input_size):
-        super(MLP, self).__init__(name_scope)
+        super(MLP, self).__init__()
-        self._fc1 = fluid.dygraph.FC(
+        self._linear1 = fluid.dygraph.Linear(
-            self.full_name(),
+            input_size,
            3,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Constant(value=0.1)),
            bias_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Constant(value=0.1)))
-        self._fc2 = fluid.dygraph.FC(
+        self._linear2 = fluid.dygraph.Linear(
-            self.full_name(),
+            3,
            4,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Constant(value=0.1)),
@ -39,8 +39,8 @@ class MLP(fluid.Layer):
                initializer=fluid.initializer.Constant(value=0.1)))
    def forward(self, inputs):
-        x = self._fc1(inputs)
+        x = self._linear1(inputs)
-        x = self._fc2(x)
+        x = self._linear2(x)
        x = fluid.layers.reduce_sum(x)
        return x
@ -48,7 +48,7 @@ class MLP(fluid.Layer):
 class TestDygraphFramework(unittest.TestCase):
    def test_dygraph_backward(self):
        with new_program_scope():
-            mlp = MLP("mlp")
+            mlp = MLP(input_size=2)
            var_inp = fluid.layers.data(
                "input", shape=[2, 2], dtype="float32", append_batch_size=False)
            out = mlp(var_inp)
--- a/python/paddle/fluid/tests/unittests/test_imperative_partitial_backward.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_partitial_backward.py
@ -24,30 +24,30 @@ class TestImperativePartitialBackward(unittest.TestCase):
        with fluid.dygraph.guard():
            x = np.random.randn(2, 4, 5).astype("float32")
            x = fluid.dygraph.to_variable(x)
-            fc1 = fluid.dygraph.FC("fc1", 10, num_flatten_dims=2)
+            linear1 = fluid.dygraph.Linear(5, 10)
-            fc2 = fluid.dygraph.FC("fc2", 10, num_flatten_dims=2)
+            linear2 = fluid.dygraph.Linear(5, 10)
-            y = fc1(x[:, :2])
+            y = linear1(x[:, :2])
-            z = fc2(x[:, 2:])
+            z = linear2(x[:, 2:])
            loss = fluid.layers.reduce_mean(y)
            loss.backward()
-            for param in fc1.parameters():
+            for param in linear1.parameters():
                self.assertIsNotNone(param._grad_ivar())
-            for param in fc2.parameters():
+            for param in linear2.parameters():
                self.assertIsNone(param._grad_ivar())
            optimizer = fluid.optimizer.AdamOptimizer(parameter_list=(
-                fc1.parameters() + fc2.parameters()))
+                linear1.parameters() + linear2.parameters()))
            _, params_grads = optimizer.minimize(loss)
            self.assertListEqual(
-                sorted([p.name for p in fc1.parameters()]),
+                sorted([p.name for p in linear1.parameters()]),
                sorted([p_g[0].name for p_g in params_grads]))
-            fc1.clear_gradients()
+            linear1.clear_gradients()
-            fc2.clear_gradients()
+            linear2.clear_gradients()
 if __name__ == '__main__':
--- a/python/paddle/fluid/tests/unittests/test_imperative_reinforcement.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_reinforcement.py
@ -23,18 +23,18 @@ import paddle
 import paddle.fluid as fluid
 from paddle.fluid import core
 from paddle.fluid.optimizer import SGDOptimizer
-from paddle.fluid.dygraph.nn import Conv2D, Pool2D, FC
+from paddle.fluid.dygraph.nn import Conv2D, Pool2D, Linear
 import paddle.fluid.dygraph.nn as nn
 from paddle.fluid.dygraph.base import to_variable
 from test_imperative_base import new_program_scope
 class Policy(fluid.dygraph.Layer):
-    def __init__(self, name_scope):
+    def __init__(self, input_size):
-        super(Policy, self).__init__(name_scope)
+        super(Policy, self).__init__()
-        self.affine1 = nn.FC(self.full_name(), size=128)
+        self.affine1 = nn.Linear(input_size, 128)
-        self.affine2 = nn.FC(self.full_name(), size=2)
+        self.affine2 = nn.Linear(128, 2)
        self.dropout_ratio = 0.6
        self.saved_log_probs = []
@ -67,7 +67,7 @@ class TestImperativeMnist(unittest.TestCase):
            fluid.default_startup_program().random_seed = seed
            fluid.default_main_program().random_seed = seed
-            policy = Policy("PolicyModel")
+            policy = Policy(input_size=4)
            dy_state = fluid.dygraph.base.to_variable(state)
            dy_state.stop_gradient = True
@ -111,7 +111,7 @@ class TestImperativeMnist(unittest.TestCase):
            exe = fluid.Executor(fluid.CPUPlace(
            ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
-            policy = Policy("PolicyModel")
+            policy = Policy(input_size=4)
            st_sgd = SGDOptimizer(learning_rate=1e-3)
--- a/python/paddle/fluid/tests/unittests/test_imperative_save_load.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_save_load.py
@ -131,14 +131,13 @@ class SimpleLSTMRNN(fluid.Layer):
 class PtbModel(fluid.Layer):
    def __init__(self,
-                 name_scope,
                 hidden_size,
                 vocab_size,
                 num_layers=2,
                 num_steps=20,
                 init_scale=0.1,
                 dropout=None):
-        super(PtbModel, self).__init__(name_scope)
+        super(PtbModel, self).__init__()
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.init_scale = init_scale
@ -160,7 +159,18 @@ class PtbModel(fluid.Layer):
                initializer=fluid.initializer.UniformInitializer(
                    low=-init_scale, high=init_scale)))
-        self.out_project = Linear(self.hidden_size, self.vocab_size)
+        self.softmax_weight = self.create_parameter(
+            attr=fluid.ParamAttr(),
+            shape=[self.hidden_size, self.vocab_size],
+            dtype="float32",
+            default_initializer=fluid.initializer.UniformInitializer(
+                low=-self.init_scale, high=self.init_scale))
+        self.softmax_bias = self.create_parameter(
+            attr=fluid.ParamAttr(),
+            shape=[self.vocab_size],
+            dtype="float32",
+            default_initializer=fluid.initializer.UniformInitializer(
+                low=-self.init_scale, high=self.init_scale))
    def forward(self, input, label, init_hidden, init_cell):
        init_h = fluid.layers.reshape(
@ -182,7 +192,8 @@ class PtbModel(fluid.Layer):
        rnn_out = fluid.layers.reshape(
            rnn_out, shape=[-1, self.num_steps, self.hidden_size])
-        projection = self.out_project(rnn_out)
+        projection = fluid.layers.matmul(rnn_out, self.softmax_weight)
+        projection = fluid.layers.elementwise_add(projection, self.softmax_bias)
        projection = fluid.layers.reshape(
            projection, shape=[-1, self.vocab_size])
        loss = fluid.layers.softmax_with_cross_entropy(
@ -210,7 +221,6 @@ class TestDygraphPtbRnn(unittest.TestCase):
            fluid.default_main_program().random_seed = seed
            # TODO: marsyang1993 Change seed to
            ptb_model = PtbModel(
-                "ptb_model",
                hidden_size=hidden_size,
                vocab_size=vocab_size,
                num_layers=num_layers,
@ -294,7 +304,6 @@ class TestDygraphPtbRnn(unittest.TestCase):
            fluid.default_main_program().random_seed = seed
            # TODO: marsyang1993 Change seed to
            ptb_model = PtbModel(
-                "ptb_model",
                hidden_size=hidden_size,
                vocab_size=vocab_size,
                num_layers=num_layers,
@ -400,7 +409,6 @@ class TestDygraphPtbRnn(unittest.TestCase):
            fluid.default_main_program().random_seed = seed
            # TODO: marsyang1993 Change seed to
            ptb_model = PtbModel(
-                "ptb_model",
                hidden_size=hidden_size,
                vocab_size=vocab_size,
                num_layers=num_layers,
@ -505,7 +513,6 @@ class TestDygraphPtbRnn(unittest.TestCase):
            fluid.default_main_program().random_seed = seed
            # TODO: marsyang1993 Change seed to
            ptb_model = PtbModel(
-                "ptb_model",
                hidden_size=hidden_size,
                vocab_size=vocab_size,
                num_layers=num_layers,
@ -614,7 +621,6 @@ class TestDygraphPtbRnn(unittest.TestCase):
            fluid.default_main_program().random_seed = seed
            # TODO: marsyang1993 Change seed to
            ptb_model = PtbModel(
-                "ptb_model",
                hidden_size=hidden_size,
                vocab_size=vocab_size,
                num_layers=num_layers,
@ -694,7 +700,6 @@ class TestDygraphPtbRnn(unittest.TestCase):
            fluid.default_main_program().random_seed = seed
            # TODO: marsyang1993 Change seed to
            ptb_model = PtbModel(
-                "ptb_model",
                hidden_size=hidden_size,
                vocab_size=vocab_size,
                num_layers=num_layers,
@ -786,7 +791,6 @@ class TestDygraphPtbRnn(unittest.TestCase):
            fluid.default_main_program().random_seed = seed
            # TODO: marsyang1993 Change seed to
            ptb_model = PtbModel(
-                "ptb_model",
                hidden_size=hidden_size,
                vocab_size=vocab_size,
                num_layers=num_layers,
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@ -85,30 +85,25 @@ class LayerTest(unittest.TestCase):
 class TestLayer(LayerTest):
    def test_custom_layer_with_kwargs(self):
        class CustomLayer(fluid.Layer):
-            def __init__(self, name_scope, fc1_size=4):
+            def __init__(self, input_size, linear1_size=4):
-                super(CustomLayer, self).__init__(name_scope)
+                super(CustomLayer, self).__init__()
-                self.fc1 = nn.FC('fc1',
+                self.linear1 = nn.Linear(
-                                 size=fc1_size,
+                    input_size, linear1_size, bias_attr=False)
-                                 bias_attr=False,
+                self.linear2 = nn.Linear(linear1_size, 1, bias_attr=False)
-                                 num_flatten_dims=1)
+
-                self.fc2 = nn.FC('fc2',
+            def forward(self, x, do_linear2=False):
-                                 size=1,
+                ret = self.linear1(x)
-                                 bias_attr=False,
+                if do_linear2:
-                                 num_flatten_dims=1)
+                    ret = self.linear2(ret)
-            def forward(self, x, do_fc2=False):
-                ret = self.fc1(x)
-                if do_fc2:
-                    ret = self.fc2(ret)
                return ret
        with self.dynamic_graph():
            inp = np.ones([3, 3], dtype='float32')
            x = base.to_variable(inp)
-            custom = CustomLayer('custom', fc1_size=2)
+            custom = CustomLayer(input_size=3, linear1_size=2)
-            ret = custom(x, do_fc2=False)
+            ret = custom(x, do_linear2=False)
            self.assertTrue(np.array_equal(ret.numpy().shape, [3, 2]))
-            ret = custom(x, do_fc2=True)
+            ret = custom(x, do_linear2=True)
            self.assertTrue(np.array_equal(ret.numpy().shape, [3, 1]))
    def test_linear(self):
@ -133,112 +128,6 @@ class TestLayer(LayerTest):
        self.assertTrue(np.array_equal(static_ret, dy_ret_value))
-        inp = np.ones([3, 32], dtype='float32')
-        with self.dynamic_graph():
-            t = base.to_variable(inp)
-            linear = nn.Linear(32, 4, bias_attr=False)
-            dy_ret = linear(t)
-            dy_ret_value = dy_ret.numpy()
-        with self.dynamic_graph():
-            t = base.to_variable(inp)
-            fc = nn.FC('fc1', size=4, bias_attr=False, num_flatten_dims=1)
-            dy_ret2 = fc(t)
-            dy_ret_value2 = dy_ret2.numpy()
-        self.assertTrue(np.array_equal(dy_ret_value, dy_ret_value2))
-    def test_fc(self):
-        inp = np.ones([3, 32, 32], dtype='float32')
-        with self.static_graph():
-            t = layers.data(
-                name='data',
-                shape=[3, 32, 32],
-                dtype='float32',
-                append_batch_size=False)
-            ret = layers.fc(t, size=4, bias_attr=False, num_flatten_dims=1)
-            ret2 = layers.fc(ret, size=4)
-            static_ret = self.get_static_graph_result(
-                feed={'data': inp}, fetch_list=[ret2])[0]
-        with self.static_graph():
-            t = layers.data(
-                name='data',
-                shape=[3, 32, 32],
-                dtype='float32',
-                append_batch_size=False)
-            fc1 = nn.FC('fc1', size=4, bias_attr=False, num_flatten_dims=1)
-            fc2 = nn.FC('fc2', size=4)
-            ret = fc1(t)
-            ret2 = fc2(ret)
-            static_ret2 = self.get_static_graph_result(
-                feed={'data': inp}, fetch_list=[ret2])[0]
-        with self.dynamic_graph():
-            t = base.to_variable(inp)
-            fc1 = nn.FC('fc1', size=4, bias_attr=False, num_flatten_dims=1)
-            fc2 = nn.FC('fc2', size=4)
-            ret = fc1(t)
-            dy_ret = fc2(ret)
-            dy_ret_value = dy_ret.numpy()
-        self.assertTrue(np.array_equal(static_ret, static_ret2))
-        self.assertTrue(np.array_equal(static_ret, dy_ret_value))
-        with self.dynamic_graph():
-            custom_weight = np.random.randn(1024, 4).astype("float32")
-            weight_attr1 = fluid.ParamAttr(
-                initializer=fluid.initializer.NumpyArrayInitializer(
-                    custom_weight))
-            fc1 = fluid.dygraph.FC("fc1",
-                                   4,
-                                   num_flatten_dims=1,
-                                   param_attr=weight_attr1)
-            out1 = fc1(base.to_variable(inp))
-            loss1 = fluid.layers.reduce_mean(out1)
-            fc1_weight_init = fc1.weight.detach()
-            fc1_bias_init = fc1.bias.detach()
-            loss1.backward()
-            optimizer1 = fluid.optimizer.SGD(learning_rate=0.1,
-                                             parameter_list=fc1.parameters())
-            optimizer1.minimize(loss1)
-            fc1_weight_updated = fc1.weight.detach()
-        with self.dynamic_graph():
-            weight_attr2 = fluid.ParamAttr(
-                initializer=fluid.initializer.Uniform())
-            fc2 = fluid.dygraph.FC("fc2",
-                                   4,
-                                   num_flatten_dims=1,
-                                   param_attr=weight_attr2)
-            out2 = fc2(base.to_variable(inp))
-            self.assertFalse(
-                np.array_equal(fc1_weight_init.numpy(), fc2.weight.numpy()))
-            self.assertFalse(np.array_equal(out1.numpy(), out2.numpy()))
-            mismatched_weight = np.random.randn(4, 4).astype("float32")
-            with self.assertRaises(AssertionError):
-                fc2.weight.set_value(mismatched_weight)
-            fc2.weight.set_value(fc1_weight_init)
-            fc2.bias.set_value(fc1_bias_init)
-            out2 = fc2(base.to_variable(inp))
-            loss2 = fluid.layers.reduce_mean(out2)
-            loss2.backward()
-            optimizer2 = fluid.optimizer.SGD(learning_rate=0.1,
-                                             parameter_list=fc2.parameters())
-            optimizer2.minimize(loss2)
-            self.assertTrue(
-                np.array_equal(fc2.weight.numpy(), fc1_weight_updated.numpy()))
-            self.assertTrue(np.array_equal(out1.numpy(), out2.numpy()))
-            fc2.weight = fc1.weight
-            fc2.bias = fc1.bias
-            self.assertTrue(
-                np.array_equal(fc2.weight.numpy(), fc1.weight.numpy()))
-            self.assertTrue(np.array_equal(fc2.bias.numpy(), fc1.bias.numpy()))
    def test_layer_norm(self):
        inp = np.ones([3, 32, 32], dtype='float32')
        with self.static_graph():