Add row_conv and hsigmoid into paddle.nn (functional and layer) (#23517)

* add approximation for gelu, test=develop
* add functional conv
* add test and doc for functional convs, test=develop
* update ConvTransposeOp's InferShape and error message, test=develop
* add hsigmoid, row_conv in paddle.nn (functional and layer), test=develop
* fix hyperlinks in docstring

revert-22778-infer_var_type
parent 4231d84077
commit 600cb8c828
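As a quick orientation before the diffs, here is a minimal dygraph usage sketch of the APIs this commit adds, based only on the signatures exercised in the tests below; shapes, variable names, and the reliance on default arguments for nn.HSigmoid are illustrative assumptions, not taken verbatim from the source:

    import numpy as np
    from paddle import fluid, nn
    import paddle.fluid.dygraph as dg
    import paddle.nn.functional as F

    place = fluid.CPUPlace()
    with dg.guard(place):
        # row_conv, functional and layer forms
        x = dg.to_variable(np.random.randn(4, 12, 8).astype("float32"))
        # weight shape is [future_context_size + 1, num_channels]
        w = dg.to_variable(np.random.randn(4, 8).astype("float32"))
        y_functional = F.row_conv(x, w)
        conv = nn.RowConv(8, 3)  # num_channels=8, future_context_size=3
        y_layer = conv(x)

        # hsigmoid, layer form (default, non-custom tree)
        feat = dg.to_variable(np.random.randn(4, 6).astype("float32"))
        labels = dg.to_variable(np.random.randint(0, 8, (4, 1)).astype("int64"))
        hsig = nn.HSigmoid(6, 8)  # feature_size=6, num_classes=8
        loss = hsig(feat, labels)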
@ -0,0 +1,219 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from paddle import fluid, nn
import paddle.fluid.dygraph as dg
import paddle.nn.functional as F
import paddle.fluid.initializer as I
import numpy as np
import unittest


class HSigmoidTestCase(unittest.TestCase):
    def __init__(self,
                 methodName="runTest",
                 batch_size=4,
                 feature_size=6,
                 num_classes=8,
                 labels=None,
                 path_code=None,
                 path_table=None,
                 is_sparse=False,
                 dtype="float32"):
        super(HSigmoidTestCase, self).__init__(methodName=methodName)
        self.batch_size = batch_size
        self.feature_size = feature_size
        self.num_classes = num_classes
        self.dtype = dtype
        self.is_sparse = is_sparse

        self.labels = labels
        self.path_code = path_code
        self.path_table = path_table
        self.is_custom = path_code is not None and path_table is not None

    def setUp(self):
        input_shape = (self.batch_size, self.feature_size)
        self.input = np.random.uniform(
            -1, 1, size=input_shape).astype(self.dtype)
        if self.labels is None:
            self.labels = np.random.randint(
                0, self.num_classes,
                size=(self.batch_size, 1)).astype(np.int64)
        C = self.num_classes if self.is_custom else self.num_classes - 1
        self.weight_shape = (C, self.feature_size)
        self.weight = np.random.randn(*self.weight_shape).astype(self.dtype)
        self.bias_shape = (C, 1)
        self.bias = np.random.randn(*self.bias_shape).astype(self.dtype)

    def fluid_layer(self, place):
        main = fluid.Program()
        start = fluid.Program()
        with fluid.unique_name.guard():
            with fluid.program_guard(main, start):
                x = fluid.data(
                    "input", [-1, self.feature_size], dtype=self.dtype)
                label = fluid.data("labels", [-1, 1], dtype="int64")
                if self.is_custom:
                    path_table = fluid.data(
                        "path_table", [-1, -1], dtype="int64")
                    path_code = fluid.data(
                        "path_code", [-1, -1], dtype="int64")
                else:
                    path_table = path_code = None
                y = fluid.layers.hsigmoid(
                    x,
                    label,
                    self.num_classes,
                    param_attr=I.NumpyArrayInitializer(self.weight),
                    bias_attr=I.NumpyArrayInitializer(self.bias),
                    path_table=path_table,
                    path_code=path_code,
                    is_custom=self.is_custom,
                    is_sparse=self.is_sparse)
        exe = fluid.Executor(place)
        exe.run(start)
        feed_dict = {"input": self.input, "labels": self.labels}
        if self.is_custom:
            feed_dict["path_code"] = self.path_code
            feed_dict["path_table"] = self.path_table
        y_np, = exe.run(main, feed=feed_dict, fetch_list=[y])
        return y_np

    def functional(self, place):
        main = fluid.Program()
        start = fluid.Program()
        with fluid.unique_name.guard():
            with fluid.program_guard(main, start):
                x = fluid.data(
                    "input", [-1, self.feature_size], dtype=self.dtype)
                label = fluid.data("labels", [-1, 1], dtype="int64")
                if self.is_custom:
                    path_table = fluid.data(
                        "path_table", [-1, -1], dtype="int64")
                    path_code = fluid.data(
                        "path_code", [-1, -1], dtype="int64")
                else:
                    path_table = path_code = None
                w = fluid.data("weight", self.weight_shape, dtype=self.dtype)
                b = fluid.data("bias", self.bias_shape, dtype=self.dtype)
                y = F.hsigmoid(
                    x,
                    label,
                    w,
                    b,
                    self.num_classes,
                    is_sparse=self.is_sparse,
                    path_table=path_table,
                    path_code=path_code)

        exe = fluid.Executor(place)
        exe.run(start)
        feed_dict = {
            "input": self.input,
            "labels": self.labels,
            "weight": self.weight,
            "bias": self.bias
        }
        if self.is_custom:
            feed_dict["path_code"] = self.path_code
            feed_dict["path_table"] = self.path_table
        y_np, = exe.run(main, feed=feed_dict, fetch_list=[y])
        return y_np

    def nn_layer(self, place):
        with dg.guard(place):
            x_var = dg.to_variable(self.input)
            label_var = dg.to_variable(self.labels)
            if self.is_custom:
                path_code_var = dg.to_variable(self.path_code)
                path_table_var = dg.to_variable(self.path_table)
            else:
                path_code_var = path_table_var = None
            hierarchical_softmax = nn.HSigmoid(
                self.feature_size,
                self.num_classes,
                is_custom=self.is_custom,
                is_sparse=self.is_sparse,
                param_attr=I.NumpyArrayInitializer(self.weight),
                bias_attr=I.NumpyArrayInitializer(self.bias),
                dtype=self.dtype)
            y_var = hierarchical_softmax(
                x_var,
                label_var,
                path_table=path_table_var,
                path_code=path_code_var)
            y_np = y_var.numpy()
        return y_np

    def _test_equivalence(self, place):
        result1 = self.fluid_layer(place)
        result2 = self.functional(place)
        result3 = self.nn_layer(place)
        np.testing.assert_array_almost_equal(result1, result2)
        np.testing.assert_array_almost_equal(result2, result3)

    def runTest(self):
        place = fluid.CPUPlace()
        self._test_equivalence(place)


class HSigmoidTestErrorCase(HSigmoidTestCase):
    def runTest(self):
        place = fluid.CPUPlace()
        with dg.guard(place):
            with self.assertRaises(ValueError):
                self.nn_layer()

    def nn_layer(self):
        x_var = dg.to_variable(self.input)
        label_var = dg.to_variable(self.labels)
        if self.is_custom:
            path_code_var = dg.to_variable(self.path_code)
            path_table_var = dg.to_variable(self.path_table)
        else:
            path_code_var = path_table_var = None
        hierarchical_softmax = nn.HSigmoid(
            self.feature_size,
            self.num_classes,
            is_custom=self.is_custom,
            param_attr=I.NumpyArrayInitializer(self.weight),
            bias_attr=I.NumpyArrayInitializer(self.bias),
            dtype=self.dtype)
        y_var = hierarchical_softmax(
            x_var,
            label_var,
            path_table=path_table_var,
            path_code=path_code_var)
        y_np = y_var.numpy()
        return y_np


def load_tests(loader, standard_tests, pattern):
    suite = unittest.TestSuite()
    suite.addTest(HSigmoidTestCase(methodName="runTest"))
    # Custom-tree case: each row of path_table lists the non-leaf node ids
    # on the path from the root to that sample's label, and each row of
    # path_code gives the 0/1 branch taken at the corresponding node;
    # -1 pads unused slots.
    suite.addTest(
        HSigmoidTestCase(
            methodName="runTest",
            batch_size=4,
            feature_size=6,
            num_classes=8,
            labels=np.array([0, 1, 4, 5]).astype(np.int64),
            path_table=np.array([(0, 2, -1, -1, -1), (0, 1, 3, -1, -1),
                                 (0, 1, 4, -1, -1),
                                 (0, 2, -1, -1, -1)]).astype(np.int64),
            path_code=np.array([(0, 0, -1, -1, -1), (1, 1, 1, -1, -1),
                                (1, 0, 0, -1, -1),
                                (0, 1, -1, -1, -1)]).astype(np.int64)))
    suite.addTest(HSigmoidTestErrorCase(methodName="runTest", num_classes=1))
    return suite


if __name__ == "__main__":
    unittest.main()
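As a cross-check on what the three code paths above (fluid_layer, functional, nn_layer) are asserted to agree on, here is a minimal numpy sketch of the custom-tree hierarchical sigmoid loss, under the assumption that the op sums a sigmoid cross entropy term over each sample's path nodes; the helper name hsigmoid_custom_ref and its exact numerics are illustrative, not taken from the Paddle source:

    import numpy as np

    def hsigmoid_custom_ref(x, w, b, path_table, path_code):
        # Assumed semantics: x is [N, D] features, w is [C, D] node weights,
        # b is [C, 1] node biases, path_table/path_code are [N, L] int64
        # arrays padded with -1. Returns [N, 1] per-sample losses.
        out = np.zeros((x.shape[0], 1), dtype=x.dtype)
        for i in range(x.shape[0]):
            for node, code in zip(path_table[i], path_code[i]):
                if node < 0:  # -1 marks the end of this sample's path
                    break
                pre = np.dot(w[node], x[i]) + b[node, 0]
                # numerically stable sigmoid cross entropy with hard label `code`
                out[i, 0] += np.logaddexp(0.0, pre) - code * pre
        return out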
@ -0,0 +1,131 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
from paddle import fluid, nn
import paddle.fluid.dygraph as dg
import paddle.fluid.initializer as I
import paddle.nn.functional as F
import unittest


class RowConvTestCase(unittest.TestCase):
    def __init__(self,
                 methodName='runTest',
                 batch_size=4,
                 num_channels=8,
                 time_steps=12,
                 context_size=3,
                 act=None,
                 dtype="float32"):
        super(RowConvTestCase, self).__init__(methodName=methodName)
        self.batch_size = batch_size
        self.num_channels = num_channels
        self.time_steps = time_steps
        self.context_size = context_size
        self.act = act
        self.dtype = dtype

    def setUp(self):
        input_shape = (self.batch_size, self.time_steps, self.num_channels)
        self.input = np.random.uniform(size=input_shape).astype(self.dtype)
        self.weight_shape = weight_shape = (self.context_size + 1,
                                            self.num_channels)
        self.weight = np.random.uniform(size=weight_shape).astype(self.dtype)

    def fluid_layer(self, place):
        main = fluid.Program()
        start = fluid.Program()
        with fluid.unique_name.guard():
            with fluid.program_guard(main, start):
                x = fluid.data(
                    "input", [-1, -1, self.num_channels], dtype=self.dtype)
                y = fluid.layers.row_conv(
                    x,
                    self.context_size,
                    param_attr=I.NumpyArrayInitializer(self.weight),
                    act=self.act)
        exe = fluid.Executor(place)
        exe.run(start)
        y_np, = exe.run(main, feed={"input": self.input}, fetch_list=[y])
        return y_np

    def functional_declarative(self, place):
        main = fluid.Program()
        start = fluid.Program()
        with fluid.unique_name.guard():
            with fluid.program_guard(main, start):
                x = fluid.data(
                    "input", [-1, -1, self.num_channels], dtype=self.dtype)
                w = fluid.data("weight", self.weight_shape, dtype=self.dtype)
                y = F.row_conv(x, w, act=self.act)
        exe = fluid.Executor(place)
        exe.run(start)
        y_np, = exe.run(main,
                        feed={"input": self.input,
                              "weight": self.weight},
                        fetch_list=[y])
        return y_np

    def functional_imperative(self, place):
        with dg.guard(place):
            x_var = dg.to_variable(self.input)
            w_var = dg.to_variable(self.weight)
            y_var = F.row_conv(x_var, w_var, act=self.act)
            y_np = y_var.numpy()
        return y_np

    def nn_layer(self, place):
        with dg.guard(place):
            x_var = dg.to_variable(self.input)
            conv = nn.RowConv(
                self.num_channels,
                self.context_size,
                param_attr=I.NumpyArrayInitializer(self.weight),
                act=self.act,
                dtype=self.dtype)
            y_var = conv(x_var)
            y_np = y_var.numpy()
        return y_np

    def _test_equivalence(self, place):
        result1 = self.fluid_layer(place)
        result2 = self.functional_declarative(place)
        result3 = self.functional_imperative(place)
        result4 = self.nn_layer(place)
        np.testing.assert_array_almost_equal(result1, result2)
        np.testing.assert_array_almost_equal(result2, result3)
        np.testing.assert_array_almost_equal(result3, result4)

    def runTest(self):
        place = fluid.CPUPlace()
        self._test_equivalence(place)

        if fluid.core.is_compiled_with_cuda():
            place = fluid.CUDAPlace(0)
            self._test_equivalence(place)


def load_tests(loader, standard_tests, pattern):
    suite = unittest.TestSuite()
    suite.addTest(RowConvTestCase(methodName="runTest"))
    suite.addTest(RowConvTestCase(methodName="runTest", act="sigmoid"))
    suite.addTest(
        RowConvTestCase(
            methodName="runTest", context_size=5, act="sigmoid"))
    return suite


if __name__ == "__main__":
    unittest.main()
@ -0,0 +1,103 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

__all__ = ["RowConv"]

from ...fluid.dygraph import layers
from .. import functional as F


class RowConv(layers.Layer):
    """
    **Row-convolution operator**

    Row convolution is also known as lookahead convolution. This operator
    was introduced in the following paper for
    `DeepSpeech2 <http://www.cs.cmu.edu/~dyogatam/papers/wang+etal.iclrworkshop2016.pdf>`_.

    The main motivation is that a bidirectional RNN, useful in DeepSpeech-like
    speech models, learns a representation for a sequence by performing a
    forward and a backward pass through the entire sequence. However, unlike
    unidirectional RNNs, bidirectional RNNs are challenging to deploy in an
    online, low-latency setting. The lookahead convolution incorporates
    information from future subsequences in a computationally efficient
    manner to improve unidirectional recurrent neural networks. The row
    convolution operator is different from 1D sequence convolution and is
    computed as follows:

    Given an input sequence X of length t and input dimension D, and a
    filter W of size (context x D), the output sequence is convolved as

    .. math::
        out_{i} = \\sum_{j = i}^{i + context - 1} X_{j} \\cdot W_{j - i}

    For more details about row_conv, please refer to the design document
    `<https://github.com/PaddlePaddle/Paddle/issues/2228#issuecomment-303903645>`_ .

    Parameters:
        num_channels (int): Feature size of the input data.
        future_context_size (int): Future context size. Please note, the
            shape of the convolution kernel is [future_context_size + 1, D].
        param_attr (ParamAttr): Attributes of parameters, including
            name, initializer etc. Default: None.
        act (str): Non-linear activation to be applied to the output
            variable. Default: None.
        dtype (str, optional): Data type of the parameters. It can be
            "float32". Default: "float32".

    Attributes:
        weight (Parameter): shape [future_context_size + 1, D], the
            learnable weight (convolution kernel) of this layer.

    Returns:
        None

    Examples:
        .. code-block:: python

            from paddle import fluid, nn
            import paddle.fluid.dygraph as dg
            import paddle.nn.functional as F
            import numpy as np

            batch_size = 4
            time_steps = 8
            feature_size = 6
            context_size = 4

            x = np.random.randn(batch_size, time_steps, feature_size).astype(np.float32)

            place = fluid.CPUPlace()
            with dg.guard(place):
                x_var = dg.to_variable(x)
                conv = nn.RowConv(feature_size, context_size)
                y_var = conv(x_var)
                y_np = y_var.numpy()
            print(y_np.shape)

            # (4, 8, 6)
    """

    def __init__(self,
                 num_channels,
                 future_context_size,
                 param_attr=None,
                 act=None,
                 dtype="float32"):
        super(RowConv, self).__init__()
        self._dtype = dtype
        self._param_attr = param_attr
        self._act = act

        filter_shape = [future_context_size + 1, num_channels]
        self.weight = self.create_parameter(
            filter_shape, attr=param_attr, dtype=dtype)

    def forward(self, input):
        out = F.row_conv(input, self.weight, act=self._act)
        return out
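To make the formula in the docstring above concrete, here is a minimal plain-numpy sketch of the lookahead convolution, assuming future steps that fall past the end of the sequence contribute zero; the helper name row_conv_ref is illustrative, not part of the API:

    import numpy as np

    def row_conv_ref(x, w):
        # x: [batch, time, channels]; w: [context, channels] with
        # context = future_context_size + 1. Implements
        # out[i] = sum_{j=0}^{context-1} x[i + j] * w[j],
        # dropping terms that run past the end of the sequence.
        batch, time, channels = x.shape
        context = w.shape[0]
        out = np.zeros_like(x)
        for i in range(time):
            for j in range(min(context, time - i)):
                out[:, i, :] += x[:, i + j, :] * w[j]
        return out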