diff --git a/.gitmodules b/.gitmodules
deleted file mode 100644
index c614602cb8..0000000000
--- a/.gitmodules
+++ /dev/null
@@ -1,3 +0,0 @@
-[submodule "book"]
-	path = book
-	url = https://github.com/PaddlePaddle/book.git
diff --git a/paddle/scripts/docker/Dockerfile.gpu b/Dockerfile
similarity index 85%
rename from paddle/scripts/docker/Dockerfile.gpu
rename to Dockerfile
index 06e53a0ef3..536adb0716 100644
--- a/paddle/scripts/docker/Dockerfile.gpu
+++ b/Dockerfile
@@ -1,26 +1,28 @@
+# A image for building paddle binaries
+# Use cuda devel base image for both cpu and gpu environment
 FROM nvidia/cuda:7.5-cudnn5-devel-ubuntu14.04
 MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
 
 ARG DEBIAN_FRONTEND=noninteractive
 ARG UBUNTU_MIRROR
-RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi'
+RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi'
 
 # ENV variables
 ARG BUILD_WOBOQ
 ARG BUILD_AND_INSTALL
+ARG WITH_GPU
 ARG WITH_AVX
 ARG WITH_DOC
 ARG WITH_STYLE_CHECK
 
 ENV BUILD_WOBOQ=${BUILD_WOBOQ:-OFF}
 ENV BUILD_AND_INSTALL=${BUILD_AND_INSTALL:-OFF}
-ENV WITH_GPU=ON
+ENV WITH_GPU=${WITH_GPU:-OFF}
 ENV WITH_AVX=${WITH_AVX:-ON}
 ENV WITH_DOC=${WITH_DOC:-OFF}
 ENV WITH_STYLE_CHECK=${WITH_STYLE_CHECK:-OFF}
 
 ENV HOME /root
-
 # Add bash enhancements
 COPY ./paddle/scripts/docker/root/ /root/
 
@@ -49,9 +51,7 @@ RUN curl -sSL https://cmake.org/files/v3.4/cmake-3.4.1.tar.gz | tar -xz && \
     cd cmake-3.4.1 && ./bootstrap && make -j `nproc` && make install && \
     cd .. && rm -rf cmake-3.4.1
 
-COPY . /paddle/
-RUN cd /paddle/ && git submodule update --init --recursive
-RUN /paddle/paddle/scripts/docker/build.sh
+RUN apt-get install -y swig
 
 VOLUME ["/usr/share/nginx/html/data", "/usr/share/nginx/html/paddle"]
 
@@ -62,9 +62,5 @@ RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config
 RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config
 EXPOSE 22
 
-# Jupyter Notebook: Paddle book
-EXPOSE 8888
-
-COPY ./paddle/scripts/docker/entrypoint /opt/bin/
-
-CMD ["/opt/bin/entrypoint"]
+# development image default do build work
+CMD ["bash", "/paddle/paddle/scripts/docker/build.sh"]
diff --git a/book b/book
deleted file mode 160000
index 22ed2a01ae..0000000000
--- a/book
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 22ed2a01aee872f055b5f5f212428f481cefc10d
diff --git a/cmake/FindSphinx.cmake b/cmake/FindSphinx.cmake
index 1c29cb22a3..f74cd4ff8c 100644
--- a/cmake/FindSphinx.cmake
+++ b/cmake/FindSphinx.cmake
@@ -72,7 +72,7 @@ function( Sphinx_add_target target_name builder conf cache source destination )
     ${source}
     ${destination}
     COMMENT "Generating sphinx documentation: ${builder}"
-    COMMAND cd ${destination} && ln -s ./index_*.html index.html
+    COMMAND cd ${destination} && ln -sf ./index_*.html index.html
     )
 
   set_property(
diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake
index 1575d8e9f5..446a7532c5 100644
--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -16,6 +16,14 @@ INCLUDE(ExternalProject)
 
 FIND_PACKAGE(Protobuf 3.1)
 
+IF(PROTOBUF_FOUND)
+    EXEC_PROGRAM(${PROTOBUF_PROTOC_EXECUTABLE} ARGS --version OUTPUT_VARIABLE PROTOBUF_VERSION)
+    STRING(REGEX MATCH "[0-9]+\\.[0-9]+(\\.[0-9]+)?" PROTOBUF_VERSION "${PROTOBUF_VERSION}")
+    IF("${PROTOBUF_VERSION}" VERSION_LESS "3.1")
+        SET(PROTOBUF_FOUND OFF)
+    ENDIF()
+ENDIF(PROTOBUF_FOUND)
+
 IF(NOT PROTOBUF_FOUND)
     SET(PROTOBUF_SOURCES_DIR ${THIRD_PARTY_PATH}/protobuf)
     SET(PROTOBUF_INSTALL_DIR ${THIRD_PARTY_PATH}/install/protobuf)
diff --git a/cmake/util.cmake b/cmake/util.cmake
index 24ad5c815c..3640e4651f 100644
--- a/cmake/util.cmake
+++ b/cmake/util.cmake
@@ -71,21 +71,10 @@ function(link_paddle_exe TARGET_NAME)
         generate_rdma_links()
     endif()
 
-    if(WITH_METRIC)
-        if(WITH_GPU)
-            set(METRIC_LIBS paddle_metric_learning paddle_dserver_lib metric metric_cpu)
-        else()
-            set(METRIC_LIBS paddle_metric_learning paddle_dserver_lib metric_cpu)
-        endif()
-    else()
-        set(METRIC_LIBS "")
-    endif()
-
     target_circle_link_libraries(${TARGET_NAME}
         ARCHIVE_START
         paddle_gserver
         paddle_function
-        ${METRIC_LIBS}
         ARCHIVE_END
         paddle_pserver
         paddle_trainer_lib
@@ -95,7 +84,6 @@ function(link_paddle_exe TARGET_NAME)
         paddle_parameter
         paddle_proto
         paddle_cuda
-        ${METRIC_LIBS}
         ${EXTERNAL_LIBS}
         ${CMAKE_THREAD_LIBS_INIT}
         ${CMAKE_DL_LIBS}
diff --git a/demo/introduction/api_train_v2.py b/demo/introduction/api_train_v2.py
index 84125c3b4b..1ba971b368 100644
--- a/demo/introduction/api_train_v2.py
+++ b/demo/introduction/api_train_v2.py
@@ -14,7 +14,7 @@ def main():
                                 act=paddle.activation.Linear(),
                                 bias_attr=paddle.attr.Param(name='b'))
     y = paddle.layer.data(name='y', type=paddle.data_type.dense_vector(1))
-    cost = paddle.layer.regression_cost(input=y_predict, label=y)
+    cost = paddle.layer.mse_cost(input=y_predict, label=y)
 
     # create parameters
     parameters = paddle.parameters.create(cost)
diff --git a/demo/introduction/trainer_config.py b/demo/introduction/trainer_config.py
index ecafe955f9..651dfaa4b7 100644
--- a/demo/introduction/trainer_config.py
+++ b/demo/introduction/trainer_config.py
@@ -34,5 +34,5 @@ y_predict = fc_layer(
     size=1,
     act=LinearActivation(),
     bias_attr=ParamAttr(name='b'))
-cost = regression_cost(input=y_predict, label=y)
+cost = mse_cost(input=y_predict, label=y)
 outputs(cost)
diff --git a/demo/recommendation/api_train_v2.py b/demo/recommendation/api_train_v2.py
index 9b254933a1..f6a061799e 100644
--- a/demo/recommendation/api_train_v2.py
+++ b/demo/recommendation/api_train_v2.py
@@ -61,7 +61,7 @@ def main():
 
     inference = paddle.layer.cos_sim(
         a=usr_combined_features, b=mov_combined_features, size=1, scale=5)
-    cost = paddle.layer.regression_cost(
+    cost = paddle.layer.mse_cost(
         input=inference,
         label=paddle.layer.data(
             name='score', type=paddle.data_type.dense_vector(1)))
diff --git a/demo/recommendation/trainer_config.py b/demo/recommendation/trainer_config.py
index aabcd33525..25f529d7d7 100755
--- a/demo/recommendation/trainer_config.py
+++ b/demo/recommendation/trainer_config.py
@@ -86,10 +86,7 @@ movie_feature = construct_feature("movie")
 user_feature = construct_feature("user")
 similarity = cos_sim(a=movie_feature, b=user_feature)
 if not is_predict:
-    outputs(
-        regression_cost(
-            input=similarity, label=data_layer(
-                'rating', size=1)))
+    outputs(mse_cost(input=similarity, label=data_layer('rating', size=1)))
 
     define_py_data_sources2(
         'data/train.list',
diff --git a/doc/api/index_cn.rst b/doc/api/index_cn.rst
index fca981221e..9be0b370ee 100644
--- a/doc/api/index_cn.rst
+++ b/doc/api/index_cn.rst
@@ -1,26 +1,9 @@
 API
 ===
 
-模型配置 API
-------------
-
 ..  toctree::
     :maxdepth: 1
 
-    v2/model_configs.rst
-
-数据 API
---------
-
-..  toctree::
-    :maxdepth: 1
-
-    v2/data.rst
-
-训练 API
---------
-
-..	toctree::
-	:maxdepth: 1
-
-	v2/run_logic.rst
\ No newline at end of file
+    模型配置 <v2/model_configs.rst>
+    数据访问 <v2/data.rst>
+    训练与应用 <v2/run_logic.rst>
diff --git a/doc/api/index_en.rst b/doc/api/index_en.rst
index f0ad0fb2ae..25c1dd00b9 100644
--- a/doc/api/index_en.rst
+++ b/doc/api/index_en.rst
@@ -1,26 +1,9 @@
 API
 ===
 
-Model Config API
-----------------
-
 ..  toctree::
     :maxdepth: 1
 
     v2/model_configs.rst
-
-Data API
---------
-
-..  toctree::
-    :maxdepth: 1
-
     v2/data.rst
-
-Train API
----------
-
-..	toctree::
-	:maxdepth: 1
-
-	v2/run_logic.rst
\ No newline at end of file
+    v2/run_logic.rst
diff --git a/doc/api/v1/trainer_config_helpers/layers.rst b/doc/api/v1/trainer_config_helpers/layers.rst
index bbea823de4..24389c2d85 100644
--- a/doc/api/v1/trainer_config_helpers/layers.rst
+++ b/doc/api/v1/trainer_config_helpers/layers.rst
@@ -432,6 +432,12 @@ multi_binary_label_cross_entropy
     :members: multi_binary_label_cross_entropy
     :noindex:
 
+mse_cost
+---------
+..  automodule:: paddle.trainer_config_helpers.layers
+    :members: mse_cost
+    :noindex:
+
 huber_cost
 ----------
 ..  automodule:: paddle.trainer_config_helpers.layers
@@ -450,6 +456,12 @@ rank_cost
     :members: rank_cost
     :noindex:
 
+sum_cost
+---------
+..  automodule:: paddle.trainer_config_helpers.layers
+    :members: sum_cost
+    :noindex:
+
 crf_layer
 -----------------
 ..  automodule:: paddle.trainer_config_helpers.layers
@@ -486,12 +498,6 @@ hsigmoid
     :members: hsigmoid
     :noindex:
 
-sum_cost
----------
-..  automodule:: paddle.trainer_config_helpers.layers
-    :members: sum_cost
-    :noindex:
-
 Check Layer 
 ============
 
diff --git a/doc/api/v2/config/activation.rst b/doc/api/v2/config/activation.rst
new file mode 100644
index 0000000000..eca3ce03bc
--- /dev/null
+++ b/doc/api/v2/config/activation.rst
@@ -0,0 +1,101 @@
+===========
+Activation
+===========
+
+Abs
+===
+
+..  automodule:: paddle.v2.activation
+    :members: Abs
+    :noindex:
+    
+Exp
+===
+
+..  automodule:: paddle.v2.activation
+    :members: Exp
+    :noindex:
+    
+Identity
+========
+
+..  automodule:: paddle.v2.activation
+    :members: Identity
+    :noindex:
+    
+Linear
+======
+
+..  automodule:: paddle.v2.activation
+    :members: Linear
+    :noindex:
+
+Log
+===
+
+..  automodule:: paddle.v2.activation
+    :members: Log
+    :noindex:
+    
+Square
+======
+
+..  automodule:: paddle.v2.activation
+    :members: Square
+    :noindex:
+    
+Sigmoid
+=======
+
+..  automodule:: paddle.v2.activation
+    :members: Sigmoid
+    :noindex:
+    
+Softmax
+=======
+
+..  automodule:: paddle.v2.activation
+    :members: Softmax
+    :noindex:
+    
+SequenceSoftmax
+===============
+
+..  automodule:: paddle.v2.activation
+    :members: SequenceSoftmax
+    :noindex:
+    
+Relu
+====
+
+..  automodule:: paddle.v2.activation
+    :members: Relu
+    :noindex:
+    
+BRelu
+=====
+
+..  automodule:: paddle.v2.activation
+    :members: BRelu
+    :noindex:
+    
+SoftRelu
+========
+
+..  automodule:: paddle.v2.activation
+    :members: SoftRelu
+    :noindex:
+    
+Tanh
+====
+
+..  automodule:: paddle.v2.activation
+    :members: Tanh
+    :noindex:
+    
+STanh
+=====
+
+..  automodule:: paddle.v2.activation
+    :members: STanh
+    :noindex:
diff --git a/doc/api/v2/config/attr.rst b/doc/api/v2/config/attr.rst
new file mode 100644
index 0000000000..a93f41b867
--- /dev/null
+++ b/doc/api/v2/config/attr.rst
@@ -0,0 +1,6 @@
+Parameter Attribute
+===================
+
+..  automodule:: paddle.v2.attr
+    :members:
+    :noindex:
diff --git a/doc/api/v2/config/layer.rst b/doc/api/v2/config/layer.rst
new file mode 100644
index 0000000000..db33a20487
--- /dev/null
+++ b/doc/api/v2/config/layer.rst
@@ -0,0 +1,487 @@
+..  _api_v2.layer:
+
+======
+Layers
+======
+
+Data layer
+===========
+
+..  _api_v2.layer_data:
+
+data
+----
+..  automodule:: paddle.v2.layer
+    :members: data
+    :noindex:
+
+Fully Connected Layers
+======================
+
+..  _api_v2.layer_fc:
+
+fc
+--
+..  automodule:: paddle.v2.layer
+    :members: fc
+    :noindex:
+
+selective_fc
+------------
+..  automodule:: paddle.v2.layer
+    :members: selective_fc
+    :noindex:
+
+Conv Layers
+===========
+
+conv_operator
+-------------
+..  automodule:: paddle.v2.layer
+    :members: conv_operator
+    :noindex:
+
+conv_projection
+---------------
+..  automodule:: paddle.v2.layer
+    :members: conv_projection
+    :noindex:
+
+conv_shift
+----------
+..  automodule:: paddle.v2.layer
+    :members: conv_shift
+    :noindex:
+
+img_conv
+--------
+..  automodule:: paddle.v2.layer
+    :members: img_conv
+    :noindex:
+
+..  _api_v2.layer_context_projection:
+
+context_projection 
+------------------
+..  automodule:: paddle.v2.layer
+    :members: context_projection
+    :noindex:
+
+Image Pooling Layer
+===================
+
+img_pool
+--------
+..  automodule:: paddle.v2.layer
+    :members: img_pool
+    :noindex:   
+
+spp
+---
+..  automodule:: paddle.v2.layer
+    :members: spp
+    :noindex:
+
+maxout
+------
+..  automodule:: paddle.v2.layer
+    :members: maxout
+    :noindex:
+
+Norm Layer
+==========
+
+img_cmrnorm
+-----------
+..  automodule:: paddle.v2.layer
+    :members: img_cmrnorm
+    :noindex:
+
+batch_norm
+----------
+..  automodule:: paddle.v2.layer
+    :members: batch_norm
+    :noindex:
+
+sum_to_one_norm
+---------------
+..  automodule:: paddle.v2.layer
+    :members: sum_to_one_norm
+    :noindex:
+    
+Recurrent Layers
+================
+
+recurrent
+---------
+..  automodule:: paddle.v2.layer
+    :members: recurrent
+    :noindex:
+
+lstmemory
+---------
+..  automodule:: paddle.v2.layer
+    :members: lstmemory
+    :noindex:
+
+grumemory
+---------
+..  automodule:: paddle.v2.layer
+    :members: grumemory
+    :noindex:
+
+Recurrent Layer Group
+=====================
+
+memory
+------
+..  automodule:: paddle.v2.layer
+    :members: memory
+    :noindex:
+
+recurrent_group
+---------------
+..  automodule:: paddle.v2.layer
+    :members: recurrent_group
+    :noindex:
+    
+lstm_step
+---------
+..  automodule:: paddle.v2.layer
+    :members: lstm_step
+    :noindex:
+
+gru_step
+--------
+..  automodule:: paddle.v2.layer
+    :members: gru_step
+    :noindex:
+
+beam_search
+------------
+..  automodule:: paddle.v2.layer
+    :members: beam_search
+    :noindex:
+    
+get_output
+----------
+..  automodule:: paddle.v2.layer
+    :members: get_output
+    :noindex:
+    
+Mixed Layer
+===========
+
+..  _api_v2.layer_mixed:
+
+mixed
+-----
+..  automodule:: paddle.v2.layer
+    :members: mixed
+    :noindex:
+
+..  _api_v2.layer_embedding:
+
+embedding
+---------
+..  automodule:: paddle.v2.layer
+    :members: embedding
+    :noindex:
+
+scaling_projection
+------------------
+..  automodule:: paddle.v2.layer
+    :members: scaling_projection
+    :noindex:
+
+dotmul_projection
+-----------------
+..  automodule:: paddle.v2.layer
+    :members: dotmul_projection
+    :noindex:
+
+dotmul_operator
+---------------
+..  automodule:: paddle.v2.layer
+    :members: dotmul_operator
+    :noindex:
+
+full_matrix_projection
+----------------------
+..  automodule:: paddle.v2.layer
+    :members: full_matrix_projection
+    :noindex:
+
+identity_projection
+-------------------
+..  automodule:: paddle.v2.layer
+    :members: identity_projection
+    :noindex:
+
+
+table_projection
+----------------
+..  automodule:: paddle.v2.layer
+    :members: table_projection
+    :noindex:
+
+trans_full_matrix_projection
+----------------------------
+..  automodule:: paddle.v2.layer
+    :members: trans_full_matrix_projection
+    :noindex:
+    
+Aggregate Layers
+================
+
+..  _api_v2.layer_pooling:
+
+pooling
+-------
+..  automodule:: paddle.v2.layer
+    :members: pooling
+    :noindex:
+
+..  _api_v2.layer_last_seq:
+
+last_seq
+--------
+..  automodule:: paddle.v2.layer
+    :members: last_seq
+    :noindex:
+
+..  _api_v2.layer_first_seq:
+
+first_seq
+---------
+..  automodule:: paddle.v2.layer
+    :members: first_seq
+    :noindex:
+
+concat
+------
+..  automodule:: paddle.v2.layer
+    :members: concat
+    :noindex:
+
+seq_concat
+----------
+..  automodule:: paddle.v2.layer
+    :members: seq_concat
+    :noindex:
+
+Reshaping Layers
+================
+
+block_expand
+------------
+..  automodule:: paddle.v2.layer
+    :members: block_expand
+    :noindex:
+
+..  _api_v2.layer_expand:
+
+expand
+------
+..  automodule:: paddle.v2.layer
+    :members: expand
+    :noindex:
+
+repeat
+------
+..  automodule:: paddle.v2.layer
+    :members: repeat
+    :noindex:
+
+rotate
+------
+..  automodule:: paddle.v2.layer
+    :members: rotate
+    :noindex:
+
+seq_reshape
+-----------
+..  automodule:: paddle.v2.layer
+    :members: seq_reshape
+    :noindex:
+
+Math Layers
+===========
+
+addto
+-----
+..  automodule:: paddle.v2.layer
+    :members: addto
+    :noindex:
+
+linear_comb
+-----------
+..  automodule:: paddle.v2.layer
+    :members: linear_comb
+    :noindex:
+
+interpolation
+-------------
+..  automodule:: paddle.v2.layer
+    :members: interpolation
+    :noindex:
+
+bilinear_interp
+---------------
+..  automodule:: paddle.v2.layer
+    :members: bilinear_interp
+    :noindex:
+
+power
+-----
+..  automodule:: paddle.v2.layer
+    :members: power
+    :noindex:
+
+scaling
+-------
+..  automodule:: paddle.v2.layer
+    :members: scaling
+    :noindex:
+
+slope_intercept
+---------------
+..  automodule:: paddle.v2.layer
+    :members: slope_intercept
+    :noindex:
+
+tensor
+------
+..  automodule:: paddle.v2.layer
+    :members: tensor
+    :noindex:
+
+..  _api_v2.layer_cos_sim:
+
+cos_sim
+-------
+..  automodule:: paddle.v2.layer
+    :members: cos_sim
+    :noindex:
+
+trans
+-----
+..  automodule:: paddle.v2.layer
+    :members: trans
+    :noindex:
+
+Sampling Layers
+===============
+
+maxid
+-----
+..  automodule:: paddle.v2.layer
+    :members: maxid
+    :noindex:
+
+sampling_id
+-----------
+..  automodule:: paddle.v2.layer
+    :members: sampling_id
+    :noindex:
+
+Slicing and Joining Layers
+==========================
+
+pad
+----
+..  automodule:: paddle.v2.layer
+    :members: pad
+    :noindex:
+
+..  _api_v2.layer_costs:
+
+Cost Layers
+===========
+
+cross_entropy_cost
+------------------
+..  automodule:: paddle.v2.layer
+    :members: cross_entropy_cost
+    :noindex:
+
+cross_entropy_with_selfnorm_cost
+--------------------------------
+..  automodule:: paddle.v2.layer
+    :members: cross_entropy_with_selfnorm_cost
+    :noindex:
+
+multi_binary_label_cross_entropy_cost
+-------------------------------------
+..  automodule:: paddle.v2.layer
+    :members: multi_binary_label_cross_entropy_cost
+    :noindex:
+
+huber_cost
+----------
+..  automodule:: paddle.v2.layer
+    :members: huber_cost
+    :noindex:
+
+lambda_cost
+-----------
+..  automodule:: paddle.v2.layer
+    :members: lambda_cost
+    :noindex:
+
+rank_cost
+---------
+..  automodule:: paddle.v2.layer
+    :members: rank_cost
+    :noindex:
+
+sum_cost
+---------
+..  automodule:: paddle.v2.layer
+    :members: sum_cost
+    :noindex:
+
+crf
+---
+..  automodule:: paddle.v2.layer
+    :members: crf
+    :noindex:
+
+crf_decoding
+------------
+..  automodule:: paddle.v2.layer
+    :members: crf_decoding
+    :noindex:
+
+ctc
+---
+..  automodule:: paddle.v2.layer
+    :members: ctc
+    :noindex:
+
+warp_ctc
+--------
+..  automodule:: paddle.v2.layer
+    :members: warp_ctc
+    :noindex:
+
+nce
+---
+..  automodule:: paddle.v2.layer
+    :members: nce
+    :noindex:
+
+hsigmoid
+---------
+..  automodule:: paddle.v2.layer
+    :members: hsigmoid
+    :noindex:
+
+Check Layer 
+============
+
+eos
+---
+..  automodule:: paddle.v2.layer
+    :members: eos
+    :noindex:
diff --git a/doc/api/v2/config/networks.rst b/doc/api/v2/config/networks.rst
new file mode 100644
index 0000000000..6f209bc95b
--- /dev/null
+++ b/doc/api/v2/config/networks.rst
@@ -0,0 +1,117 @@
+========
+Networks
+========
+
+The v2.networks module contains pieces of neural network that combine multiple layers.
+
+NLP
+===
+
+sequence_conv_pool
+------------------
+..  automodule:: paddle.v2.networks
+    :members: sequence_conv_pool
+    :noindex:
+
+..  _api_trainer_config_helpers_network_text_conv_pool:
+
+text_conv_pool
+--------------
+..  automodule:: paddle.v2.networks
+    :members: text_conv_pool
+    :noindex:
+
+Images
+======
+
+img_conv_bn_pool
+----------------
+..  automodule:: paddle.v2.networks
+    :members: img_conv_bn_pool
+    :noindex:
+
+img_conv_group
+--------------
+..  automodule:: paddle.v2.networks
+    :members: img_conv_group
+    :noindex:
+
+..  _api_trainer_config_helpers_network_simple_img_conv_pool:
+
+simple_img_conv_pool
+--------------------
+..  automodule:: paddle.v2.networks
+    :members: simple_img_conv_pool
+    :noindex:
+
+vgg_16_network
+---------------
+..  automodule:: paddle.v2.networks
+    :members: vgg_16_network
+    :noindex:
+
+Recurrent
+=========
+
+LSTM
+----
+
+lstmemory_unit
+``````````````
+..  automodule:: paddle.v2.networks
+    :members: lstmemory_unit
+    :noindex:
+
+lstmemory_group
+```````````````
+..  automodule:: paddle.v2.networks
+    :members: lstmemory_group
+    :noindex:
+
+simple_lstm
+```````````
+..  automodule:: paddle.v2.networks
+    :members: simple_lstm
+    :noindex:
+
+bidirectional_lstm
+``````````````````
+..  automodule:: paddle.v2.networks
+    :members: bidirectional_lstm
+    :noindex:
+
+GRU
+---
+
+gru_unit
+````````
+..  automodule:: paddle.v2.networks
+    :members: gru_unit
+    :noindex:
+
+gru_group
+`````````
+..  automodule:: paddle.v2.networks
+    :members: gru_group
+    :noindex:
+
+simple_gru
+``````````
+..  automodule:: paddle.v2.networks
+    :members: simple_gru
+    :noindex:
+
+simple_attention
+----------------
+..  automodule:: paddle.v2.networks
+    :members: simple_attention
+    :noindex:
+
+Miscs
+=====
+
+dropout_layer
+--------------
+..  automodule:: paddle.v2.networks
+    :members: dropout_layer
+    :noindex:
diff --git a/doc/api/v2/config/optimizer.rst b/doc/api/v2/config/optimizer.rst
new file mode 100644
index 0000000000..ec6ba0aa46
--- /dev/null
+++ b/doc/api/v2/config/optimizer.rst
@@ -0,0 +1,47 @@
+..  _api_v2.optimizer:
+
+==========
+Optimizer
+==========
+
+Momentum
+========
+..  automodule:: paddle.v2.optimizer
+    :members: Momentum
+    :noindex:
+
+Adam
+====
+..  automodule:: paddle.v2.optimizer
+    :members: Adam
+    :noindex:
+
+Adamax
+======
+..  automodule:: paddle.v2.optimizer
+    :members: Adamax
+    :noindex:
+
+AdaGrad
+=======
+..  automodule:: paddle.v2.optimizer
+    :members: AdaGrad
+    :noindex:
+
+DecayedAdaGrad
+==============
+..  automodule:: paddle.v2.optimizer
+    :members: DecayedAdaGrad
+    :noindex:
+
+AdaDelta
+========
+..  automodule:: paddle.v2.optimizer
+    :members: AdaDelta
+    :noindex:
+
+RMSProp
+=======
+..  automodule:: paddle.v2.optimizer
+    :members: RMSProp
+    :noindex:
diff --git a/doc/api/v2/config/pooling.rst b/doc/api/v2/config/pooling.rst
new file mode 100644
index 0000000000..d26b365c92
--- /dev/null
+++ b/doc/api/v2/config/pooling.rst
@@ -0,0 +1,46 @@
+=======
+Pooling
+=======
+
+BasePool
+========
+..  automodule:: paddle.v2.pooling
+    :members: BasePool
+    :noindex:
+
+Avg
+===
+..  automodule:: paddle.v2.pooling
+    :members: Avg
+    :noindex:
+
+Max
+===
+..  automodule:: paddle.v2.pooling
+    :members: Max
+    :noindex:
+
+Sum
+===
+..  automodule:: paddle.v2.pooling
+    :members: Sum
+    :noindex:
+
+SquareRootN
+===========
+..  automodule:: paddle.v2.pooling
+    :members: SquareRootN
+    :noindex:
+
+CudnnAvg
+========
+..  automodule:: paddle.v2.pooling
+    :members: CudnnAvg
+    :noindex:
+
+CudnnMax
+========
+..  automodule:: paddle.v2.pooling
+    :members: CudnnMax
+    :noindex:
+
diff --git a/doc/api/v2/data.rst b/doc/api/v2/data.rst
index 1c0a202a8c..b042320bc2 100644
--- a/doc/api/v2/data.rst
+++ b/doc/api/v2/data.rst
@@ -1,52 +1,53 @@
-================
-Data Related API
-================
+========
+Datasets
+========
 
 
-#########
 DataTypes
-#########
+=========
 
 ..  automodule:: paddle.v2.data_type
     :members:
+    :noindex:
 
-##########
 DataFeeder
-##########
+==========
 
 ..  automodule:: paddle.v2.data_feeder
     :members:
+    :noindex:
 
-######
 Reader
-######
+======
 
 ..  automodule:: paddle.v2.reader
     :members:
+    :noindex:
 
 ..  automodule:: paddle.v2.reader.creator
     :members:
+    :noindex:
 
-#########
 minibatch
-#########
+=========
 
 ..  automodule:: paddle.v2.minibatch
     :members:
+    :noindex:
 
-#######
 Dataset
-#######
+=======
 
 ..  automodule:: paddle.v2.dataset
     :members:
-
+    :noindex:
 
 mnist
 +++++
 
 ..  automodule:: paddle.v2.dataset.mnist
     :members:
+    :noindex:
 
 
 cifar
@@ -54,40 +55,54 @@ cifar
 
 ..  automodule:: paddle.v2.dataset.cifar
     :members:
+    :noindex:
 
 conll05
 +++++++
 
 ..  automodule:: paddle.v2.dataset.conll05
     :members:
+    :noindex:
 
 imdb
 ++++
 
 ..  automodule:: paddle.v2.dataset.imdb
     :members:
+    :noindex:
 
 imikolov
 ++++++++
 
 ..  automodule:: paddle.v2.dataset.imikolov
     :members:
+    :noindex:
 
 movielens
 +++++++++
 
 ..  automodule:: paddle.v2.dataset.movielens
     :members:
+    :noindex:
 
 sentiment
 +++++++++
 
 ..  automodule:: paddle.v2.dataset.sentiment
     :members:
+    :noindex:
 
 uci_housing
 +++++++++++
 
 ..  automodule:: paddle.v2.dataset.uci_housing
     :members:
+    :noindex:
+
+wmt14
++++++
+
+..  automodule:: paddle.v2.dataset.wmt14
+    :members:
+    :noindex:
 
diff --git a/doc/api/v2/model_configs.rst b/doc/api/v2/model_configs.rst
index e9cd3d5bf7..a5fae7e29e 100644
--- a/doc/api/v2/model_configs.rst
+++ b/doc/api/v2/model_configs.rst
@@ -1,46 +1,12 @@
-#########################
-Configuration Related API
-#########################
-
-======
-Layers
-======
-
-..  automodule:: paddle.v2.layer
-    :members:
-
-
-==========
-Attributes
-==========
-
-..	automodule:: paddle.v2.attr
-	:members:
-
-===========
-Activations
-===========
-
-..	automodule:: paddle.v2.activation
-	:members:
-
-========
-Poolings
-========
-
-..	automodule:: paddle.v2.pooling
-	:members:
-
-========
-Networks
-========
-
-..	automodule:: paddle.v2.networks
-	:members:
-
-==========
-Optimizers
-==========
-
-..	automodule:: paddle.v2.optimizer
-	:members:
+Model Configuration
+===================
+
+..  toctree::
+    :maxdepth: 1
+
+    config/activation.rst
+    config/layer.rst
+    config/optimizer.rst
+    config/pooling.rst
+    config/networks.rst
+    config/attr.rst
diff --git a/doc/api/v2/run_logic.rst b/doc/api/v2/run_logic.rst
index 0f807873ff..94921e1a7b 100644
--- a/doc/api/v2/run_logic.rst
+++ b/doc/api/v2/run_logic.rst
@@ -1,34 +1,27 @@
-###########
-Trainer API
-###########
+======================
+Training and Inference
+======================
 
-
-==========
 Parameters
 ==========
 
 ..  automodule:: paddle.v2.parameters
-    :members:
+    :noindex:
 
-
-=======
 Trainer
 =======
 
-..	automodule:: paddle.v2.trainer
-	:members:
+..  automodule:: paddle.v2.trainer
+    :noindex:
 
-
-=====
 Event
 =====
 
-..	automodule:: paddle.v2.event
-	:members:
+..  automodule:: paddle.v2.event
+    :noindex:
 
-
-=========
 Inference
 =========
 
-..	autofunction:: paddle.v2.infer
\ No newline at end of file
+..  autofunction:: paddle.v2.infer
+    :noindex:
diff --git a/doc/design/multi_language_interface/why_plain_c.md b/doc/design/multi_language_interface/why_plain_c.md
new file mode 100644
index 0000000000..a3f41ca7b9
--- /dev/null
+++ b/doc/design/multi_language_interface/why_plain_c.md
@@ -0,0 +1,118 @@
+# Paddle多语言接口实现
+## 背景
+
+Paddle需要一个多语言接口,这个接口需要做到:
+
+* 有标准的,良好的文档
+    * 例如Python可以使用[Sphinx](http://www.sphinx-doc.org/en/stable/)生成API文档,golang可以使用[GoDoc](https://godoc.org/golang.org/x/tools/cmd/godoc)生成文档。这都需要这个接口按照约定俗成的规则来注释完备。
+* 不同语言的接口适应不同语言的特性
+    * 例如Java与Python的错误处理是直接扔出来Exception,而对于golang错误处理应该使用返回值。
+
+## 基本要求
+
+Paddle的多语言接口实现包括以下几个方面:
+
+* 我们使用动态库来分发Paddle。在这个动态库中不嵌入任何其他语言的解释器,也不使用其他动态库。
+* 这个动态库使用C99标准的头文件导出一些函数,不使用/导出C++符号。
+* 不导出Paddle内部的结构体、类,仅仅使用`void*`指针作为类型的句柄(handler)。
+* 不使用SWIG这种代码生成器,而是手写多语言绑定。
+
+
+## 原因
+
+### 使用动态库来分发Paddle
+
+* Paddle的链接方式比较复杂
+    * 如果用户要把Paddle的静态库(libpaddle.a)链接到自己的程序里,得使用 `--whole-archive` (for GCC) 或者 `--force_load` (for Clang) 参数,来确保把 libpaddle.a 里所有的符号都写入自己的程序的二进制文件里。这是因为 Paddle 的源码里使用了[object factory design pattern](http://stackoverflow.com/a/1310326/724872)。
+* 编译型语言,例如C/C++使用静态库和动态库难度差不多。但是解释性语言,例如[Python](http://stackoverflow.com/questions/19560594/how-to-import-static-library-in-python)或者[Java](http://stackoverflow.com/questions/24493337/linking-static-library-with-jni),只能调用Paddle的动态库,否则得把Paddle静态库链接到解释器里。
+    * 解释性语言实际运行的二进制是解释器本身,如果调用静态库只能将静态库与解释器链接。例如对于Java来说,便是将静态库加入JVM中。这对于通常的Java的开发者来说,是不常见的做法。
+
+### 动态库中不嵌入任何其他语言的解释器
+
+* 目前Paddle的进程模型是C++内部驱动Python解释器进行模型配置解析和数据读取
+* 我们最终的动态库中不嵌入Python或者其他任何语言的解释器。模型配置解析,数据读取均交由其他语言完成
+
+现阶段Paddle有一个问题是,Paddle内嵌的Python解释器和外部使用的Python如果版本不同,会直接报错退出。
+
+### Paddle动态库中,不引用其他动态库
+
+* 即这个动态库是不依赖于其他任何文件的,可以在任何机器上执行的。
+
+###  这个动态库使用C99标准的头文件导出一些函数,不使用/导出C++符号
+
+* 由于C++编译器没有[名字修饰](https://en.wikipedia.org/wiki/Name_mangling#C.2B.2B)的规范,不同版本的编译器之间,对于同一段C++代码生成的符号可能不一致。而多语言接口需要直接读取生成的二进制(动态库),需要有稳定的导出符号。
+* C语言是有导出符号的标准的,并且在常见的平台上,都是ABI调用标准的。
+* 大多数语言都支持使用C语言API
+* 使用C99而不使用C89,是因为C99支持[Fixed-width integer types](https://en.wikipedia.org/wiki/C_data_types#Fixed-width_integer_types)和[Boolean type](https://en.wikipedia.org/wiki/C_data_types#Boolean_type)。
+* 使用C99而不使用C11的原因是,[C11](https://en.wikipedia.org/wiki/C11_(C_standard_revision))并没有Paddle特别需要的特性,且C99相对于C11使用更加广泛。
+
+### 不导出Paddle内部的结构体、类,仅仅使用`void*`指针作为类型的句柄(handler)
+
+* Paddle内部的类为C++书写,直接导出到C的接口比较困难。
+* 在C-API中使用`void*`来表示Paddle内部类。再在每一个API中自己检查类型。
+
+在C的头文件 `paddle_matrix.h` 中:
+
+```C
+typedef void* paddle_matrix;
+typedef int paddle_error;
+
+extern "C"
+paddle_error paddle_matrix_shape(paddle_matrix matrix,
+                                 uint64_t* width,
+                                 uint64_t* height);
+```
+而在CPP里面实现这个C的接口,文件 `paddle_matrix.cpp`
+
+```cpp
+#include "paddle/math/matrix.hpp"
+extern "C"
+paddle_error paddle_matrix_shape(paddle_matrix matrix,
+                                 uint64_t *width,
+                                 uint64_t *height) {
+  auto m = (paddle::math::Matrix*)(matrix);
+  *width = m->width();
+  *height = m->height();
+}
+```
+
+其中`paddle/math/matrix.hpp`文件内容为:
+
+```cpp
+namespace paddle {
+namespace math {  
+
+class Matrix {
+  //...
+};
+
+}  // namespace math
+}  // namespace paddle
+```
+
+### 不使用SWIG这种代码生成器,而是手写多语言绑定
+
+* [SWIG](http://www.swig.org/)是一个多语言接口的代码生成器。他的目标是使用C/C++写代码,SWIG直接读取C/C++的头文件,生成各种语言的绑定代码。
+    * 对于多语言接口,SWIG需要写一个interface文件。这个文件具有独特的语法,学习成本高。且增加一个第三方语言,就需要对这个第三方语言增加一些定义。有的时候,interface文件的写法非常[tricky](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/api/Paddle.swig#L36)。社区贡献代码学习成本高。
+    * SWIG暴露的接口保留了C++的接口样式,很难保证多语言代码风格的一致性。(函数命名,错误处理)
+        * 因为SWIG在第三方语言中暴露的函数名,类名和C++中完全一致。C++的命名风格并不能适应其他第三方语言。如果使用SWIG我们需要将在interface文件里,将大量的`SomeCppClass`重命名成`some_python_class`,或者`SomeGoTypes`。
+        * 对于不同语言,错误处理的方式也不尽相同。例如对于Java或者Python,最常见的错误处理方式是Exception,而对于Golang,错误处理方式是返回值。而SWIG只能简单的暴露C++接口,无法做到对于各种语言错误处理方式的适配。
+    * 对于大多数语言,直接使用C语言的.h并不困难。例如Python的[cffi](https://cffi.readthedocs.io/en/latest/overview.html#simple-example-abi-level-in-line)或者[Cython](http://cython.org/), golang的[cgo](https://golang.org/cmd/cgo/)。
+    * SWIG支持的语言或者解释器有局限。例如对于Python,使用SWIG只支持CPython解释器,而不支持PyPy解释器。
+
+
+## 原因列表
+
+| 结论 | 对比 | 原因 |
+|---| --- | --- |
+| 使用动态库 | 不使用静态库 | 解释型语言只能调用动态库,Paddle静态库链接复杂 |
+| 不嵌入其他语言解释器 | 不嵌入Python解释器 | Paddle C++目前嵌入Python解释器,会导致不同版本Python在一个进程里的bug |
+| 不引用其他动态库 | | Paddle一个动态库可以在任何Linux系统上运行 |
+| 使用C99做接口 | 不使用C++做接口 | C有标准的ABI,C99是目前C最广泛的使用标准,且C99支持bool类型和定长整数(uint64_t等)类型 |
+| 使用void*作为类句柄 | 不显式地写每个类具体包含什么| 实现简单,并且让接口脱离实现细节 |
+| 手写多语言绑定 | 不使用SWIG | 使用SWIG需要多语言绑定的开发人员熟练掌握SWIG配置,社区参与困难。SWIG生成的代码不能保证多语言代码风格的一致性 |
+
+
+## 简单实现
+
+TBD
diff --git a/doc/faq/index_cn.rst b/doc/faq/index_cn.rst
index 6d5367177d..df5e172252 100644
--- a/doc/faq/index_cn.rst
+++ b/doc/faq/index_cn.rst
@@ -286,3 +286,16 @@ PaddlePaddle的参数使用名字 :code:`name` 作为参数的ID,相同名字
 ..      code-block:: bash
 
         paddle train --use_gpu=true --trainer_count=2 --gpu_id=2
+
+
+12. 训练过程中出现 :code:`Floating point exception`, 训练因此退出怎么办?
+------------------------------------------------------------------------
+
+Paddle二进制在运行时捕获了浮点数异常,只要出现浮点数异常(即训练过程中出现NaN或者Inf),立刻退出。浮点异常通常的原因是浮点数溢出、除零等问题。
+主要原因包括以下几个方面:
+
+* 训练过程中参数或者梯度的尺度过大,在进行参数累加、乘除等运算时导致了浮点数溢出。
+* 模型一直不收敛,发散到了一个数值特别大的地方。
+* 训练数据有问题,导致参数收敛到了一些奇异的情况。或者输入数据尺度过大,有些特征的取值达到数百万,这时进行矩阵乘法运算就可能导致浮点数溢出。
+
+主要的解决办法是减小学习率或者对数据进行归一化处理。
diff --git a/doc/getstarted/basic_usage/index_cn.rst b/doc/getstarted/basic_usage/index_cn.rst
index d01cdaaeb7..428f58830e 100644
--- a/doc/getstarted/basic_usage/index_cn.rst
+++ b/doc/getstarted/basic_usage/index_cn.rst
@@ -55,7 +55,7 @@ PaddlePaddle是源于百度的一个深度学习平台。这份简短的介绍
     # 线性计算网络层: ȳ = wx + b
     ȳ = fc_layer(input=x, param_attr=ParamAttr(name='w'), size=1, act=LinearActivation(), bias_attr=ParamAttr(name='b'))
     # 计算误差函数,即  ȳ 和真实 y 之间的距离
-    cost = regression_cost(input= ȳ, label=y)
+    cost = mse_cost(input= ȳ, label=y)
     outputs(cost)
 
 
@@ -69,7 +69,7 @@ PaddlePaddle是源于百度的一个深度学习平台。这份简短的介绍
     
     - **数据层**:数据层 `data_layer` 是神经网络的入口,它读入数据并将它们传输到接下来的网络层。这里数据层有两个,分别对应于变量 `x` 和 `y`。
     - **全连接层**:全连接层 `fc_layer` 是基础的计算单元,这里利用它建模变量之间的线性关系。计算单元是神经网络的核心,PaddlePaddle支持大量的计算单元和任意深度的网络连接,从而可以拟合任意的函数来学习复杂的数据关系。
-    - **回归误差代价层**:回归误差代价层 `regression_cost` 是众多误差代价函数层的一种,它们在训练过程作为网络的出口,用来计算模型的误差,是模型参数优化的目标函数。
+    - **回归误差代价层**:回归误差代价层 `mse_cost` 是众多误差代价函数层的一种,它们在训练过程作为网络的出口,用来计算模型的误差,是模型参数优化的目标函数。
 
 定义了网络结构并保存为 `trainer_config.py` 之后,运行以下训练命令:
 
diff --git a/doc/getstarted/basic_usage/index_en.rst b/doc/getstarted/basic_usage/index_en.rst
index c10b897d42..6775da20c2 100644
--- a/doc/getstarted/basic_usage/index_en.rst
+++ b/doc/getstarted/basic_usage/index_en.rst
@@ -49,7 +49,7 @@ To recover this relationship between ``X`` and ``Y``, we use a neural network wi
         x = data_layer(name='x', size=1)
         y = data_layer(name='y', size=1)
         y_predict = fc_layer(input=x, param_attr=ParamAttr(name='w'), size=1, act=LinearActivation(), bias_attr=ParamAttr(name='b'))
-        cost = regression_cost(input=y_predict, label=y)
+        cost = mse_cost(input=y_predict, label=y)
         outputs(cost)
 
 Some of the most fundamental usages of PaddlePaddle are demonstrated:
diff --git a/doc/getstarted/build_and_install/docker_install_cn.rst b/doc/getstarted/build_and_install/docker_install_cn.rst
index 78f518cfe4..af889ec9d1 100644
--- a/doc/getstarted/build_and_install/docker_install_cn.rst
+++ b/doc/getstarted/build_and_install/docker_install_cn.rst
@@ -4,6 +4,86 @@ PaddlePaddle的Docker容器使用方式
 PaddlePaddle目前唯一官方支持的运行的方式是Docker容器。因为Docker能在所有主要操作系统(包括Linux,Mac OS X和Windows)上运行。 请注意,您需要更改 `Dockers设置 <https://github.com/PaddlePaddle/Paddle/issues/627>`_ 才能充分利用Mac OS X和Windows上的硬件资源。
 
 
+纯CPU和GPU的docker镜像使用说明
+------------------------------
+
+对于每一个PaddlePaddle版本,我们都会发布两个Docker镜像:纯CPU的和GPU的。
+我们通过设置 `dockerhub.com <https://hub.docker.com/r/paddledev/paddle/>`_ 自动生成最新的docker镜像:
+`paddledev/paddle:0.10.0rc1-cpu` 和 `paddledev/paddle:0.10.0rc1-gpu`。
+
+以交互容器方式运行纯CPU的镜像:
+
+.. code-block:: bash
+
+    docker run -it --rm paddledev/paddle:0.10.0rc1-cpu /bin/bash
+
+或者,可以以后台进程方式运行容器:
+
+.. code-block:: bash
+
+    docker run -d -p 2202:22 -p 8888:8888 paddledev/paddle:0.10.0rc1-cpu
+
+然后用密码 :code:`root` SSH进入容器:
+
+.. code-block:: bash
+
+    ssh -p 2202 root@localhost
+
+SSH方式的一个优点是我们可以从多个终端进入容器。比如,一个终端运行vi,另一个终端运行Python。另一个好处是我们可以把PaddlePaddle容器运行在远程服务器上,并在笔记本上通过SSH与其连接。
+
+
+以上方法在GPU镜像里也能用——只是请不要忘记安装CUDA驱动,以及告诉Docker:
+
+.. code-block:: bash
+
+    export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
+    export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
+    docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:0.10.0rc1-gpu
+
+
+运行PaddlePaddle书籍
+---------------------
+
+Jupyter Notebook是一个开源的web程序,大家可以通过它制作和分享带有代码、公式、图表、文字的交互式文档。用户可以通过网页浏览文档。
+
+PaddlePaddle书籍是为用户和开发者制作的一个交互式的Jupyter Notebook。
+如果您想要更深入了解deep learning,PaddlePaddle书籍一定是您最好的选择。
+
+当您进入容器内之后,只用运行以下命令:
+
+.. code-block:: bash
+        
+    jupyter notebook
+
+然后在浏览器中输入以下网址:
+    
+.. code-block:: text
+
+    http://localhost:8888/
+
+就这么简单,享受您的旅程!
+
+
+非AVX镜像
+---------
+
+纯CPU镜像以及GPU镜像都会用到AVX指令集,但是2008年之前生产的旧电脑不支持AVX。以下指令能检查Linux电脑是否支持AVX:
+
+.. code-block:: bash
+
+   if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi
+
+如果输出是No,我们就需要手动编译一个非AVX版本的镜像:
+
+.. code-block:: bash
+
+   cd ~
+   git clone https://github.com/PaddlePaddle/Paddle.git
+   cd Paddle
+   docker build --build-arg WITH_AVX=OFF -t paddle:cpu-noavx -f paddle/scripts/docker/Dockerfile .
+   docker build --build-arg WITH_AVX=OFF -t paddle:gpu-noavx -f paddle/scripts/docker/Dockerfile.gpu .
+
+
 通过Docker容器开发PaddlePaddle
 ------------------------------
 
@@ -57,67 +137,6 @@ PaddlePaddle目前唯一官方支持的运行的方式是Docker容器。因为Do
       ctest
 
 
-纯CPU和GPU的docker镜像
-----------------------
-
-对于每一个PaddlePaddle版本,我们都会发布两个Docker镜像:纯CPU的和GPU的。我们通过设置 `dockerhub.com <https://hub.docker.com/r/paddledev/paddle/>`_ 自动运行以下两个命令:
-
-.. code-block:: bash
-
-   docker build -t paddle:cpu -f paddle/scripts/docker/Dockerfile .
-   docker build -t paddle:gpu -f paddle/scripts/docker/Dockerfile.gpu .
-
-以交互容器方式运行纯CPU的镜像:
-
-.. code-block:: bash
-
-    docker run -it --rm paddledev/paddle:cpu-latest /bin/bash
-
-或者,可以以后台进程方式运行容器:
-
-.. code-block:: bash
-
-    docker run -d -p 2202:22 paddledev/paddle:cpu-latest
-
-然后用密码 :code:`root` SSH进入容器:
-
-.. code-block:: bash
-
-    ssh -p 2202 root@localhost
-
-SSH方式的一个优点是我们可以从多个终端进入容器。比如,一个终端运行vi,另一个终端运行Python。另一个好处是我们可以把PaddlePaddle容器运行在远程服务器上,并在笔记本上通过SSH与其连接。
-
-
-以上方法在GPU镜像里也能用-只是请不要忘记按装CUDA驱动,以及告诉Docker:
-
-.. code-block:: bash
-
-    export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
-    export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
-    docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:gpu-latest
-
-
-非AVX镜像
----------
-
-纯CPU镜像以及GPU镜像都会用到AVX指令集,但是2008年之前生产的旧电脑不支持AVX。以下指令能检查Linux电脑是否支持AVX:
-
-
-.. code-block:: bash
-
-   if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi
-
-如果输出是No,我们就需要手动编译一个非AVX版本的镜像:
-
-.. code-block:: bash
-
-   cd ~
-   git clone https://github.com/PaddlePaddle/Paddle.git
-   cd Paddle
-   docker build --build-arg WITH_AVX=OFF -t paddle:cpu-noavx -f paddle/scripts/docker/Dockerfile .
-   docker build --build-arg WITH_AVX=OFF -t paddle:gpu-noavx -f paddle/scripts/docker/Dockerfile.gpu .
-
-
 文档
 ----
 
@@ -128,7 +147,7 @@ Paddle的Docker镜像带有一个通过 `woboq code browser
 
 .. code-block:: bash
 
-   docker run -d --name paddle-cpu-doc paddle:cpu
+   docker run -d --name paddle-cpu-doc paddle:0.10.0rc1-cpu
    docker run -d --volumes-from paddle-cpu-doc -p 8088:80 nginx
 
 接着我们就能够打开浏览器在 http://localhost:8088/paddle/ 浏览代码。
diff --git a/doc/getstarted/build_and_install/docker_install_en.rst b/doc/getstarted/build_and_install/docker_install_en.rst
index a92201c618..606746597a 100644
--- a/doc/getstarted/build_and_install/docker_install_en.rst
+++ b/doc/getstarted/build_and_install/docker_install_en.rst
@@ -9,6 +9,100 @@ Please be aware that you will need to change `Dockers settings
 of your hardware resource on Mac OS X and Windows.
 
 
+Usage of CPU-only and GPU Images
+----------------------------------
+
+For each version of PaddlePaddle, we release 2 Docker images, a
+CPU-only one and a CUDA GPU one.  We do so by configuring
+`dockerhub.com <https://hub.docker.com/r/paddledev/paddle/>`_
+automatically generate the latest docker images `paddledev/paddle:0.10.0rc1-cpu`
+and `paddledev/paddle:0.10.0rc1-gpu`.
+
+To run the CPU-only image as an interactive container:
+
+.. code-block:: bash
+
+    docker run -it --rm paddledev/paddle:0.10.0rc1-cpu /bin/bash
+
+or, we can run it as a daemon container
+
+.. code-block:: bash
+
+    docker run -d -p 2202:22 -p 8888:8888 paddledev/paddle:0.10.0rc1-cpu
+
+and SSH to this container using password :code:`root`:
+
+.. code-block:: bash
+
+    ssh -p 2202 root@localhost
+
+An advantage of using SSH is that we can connect to PaddlePaddle from
+more than one terminals.  For example, one terminal running vi and
+another one running Python interpreter.  Another advantage is that we
+can run the PaddlePaddle container on a remote server and SSH to it
+from a laptop.
+
+Above methods work with the GPU image too -- just please don't forget
+to install the CUDA driver and let Docker know about it:
+
+.. code-block:: bash
+
+    export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
+    export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
+    docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:0.10.0rc1-gpu
+
+
+PaddlePaddle Book
+------------------
+
+The Jupyter Notebook is an open-source web application that allows
+you to create and share documents that contain live code, equations,
+visualizations and explanatory text in a single browser.
+
+PaddlePaddle Book is an interactive Jupyter Notebook for users and developers. 
+We already exposed port 8888 for this book. If you want to
+dig deeper into deep learning, PaddlePaddle Book definitely is your best choice.
+
+Once you are inside the container, simply issue the command:
+
+.. code-block:: bash
+        
+    jupyter notebook
+
+Then, copy and paste the printed address into your local browser:
+    
+.. code-block:: text
+
+    http://localhost:8888/
+
+That's all. Enjoy your journey!
+
+
+Non-AVX Images
+--------------
+
+Please be aware that the CPU-only and the GPU images both use the AVX
+instruction set, but old computers produced before 2008 do not support
+AVX.  The following command checks if your Linux computer supports
+AVX:
+
+.. code-block:: bash
+
+   if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi
+
+
+If it doesn't, we will need to build non-AVX images manually from
+source code:
+
+.. code-block:: bash
+
+   cd ~
+   git clone https://github.com/PaddlePaddle/Paddle.git
+   cd Paddle
+   docker build --build-arg WITH_AVX=OFF -t paddle:cpu-noavx -f paddle/scripts/docker/Dockerfile .
+   docker build --build-arg WITH_AVX=OFF -t paddle:gpu-noavx -f paddle/scripts/docker/Dockerfile.gpu .
+
+
 Development Using Docker
 ------------------------
 
@@ -82,103 +176,6 @@ Windows -- in a consistent way.
       cd /paddle/build
       ctest
 
-4. Run PaddlePaddle Book under Docker Container
-
-    The Jupyter Notebook is an open-source web application that allows
-    you to create and share documents that contain live code, equations,
-    visualizations and explanatory text in a single browser.
-
-    PaddlePaddle Book is an interactive Jupyter Notebook for users and developers. 
-    We already exposed port 8888 for this book. If you want to
-    dig deeper into deep learning, PaddlePaddle Book definitely is your best choice.
-
-    Once you are inside the container, simply issue the command:
-
-    .. code-block:: bash
-
-       jupyter notebook
-
-    Then, you would back and paste the address into the local browser:
-
-    .. code-block:: text
-
-       http://localhost:8888/
-
-    That's all. Enjoy your journey!
-
-CPU-only and GPU Images
------------------------
-
-For each version of PaddlePaddle, we release 2 Docker images, a
-CPU-only one and a CUDA GPU one.  We do so by configuring
-`dockerhub.com <https://hub.docker.com/r/paddledev/paddle/>`_
-automatically runs the following commands:
-
-.. code-block:: bash
-
-   docker build -t paddle:cpu -f paddle/scripts/docker/Dockerfile .
-   docker build -t paddle:gpu -f paddle/scripts/docker/Dockerfile.gpu .
-
-
-To run the CPU-only image as an interactive container:
-
-.. code-block:: bash
-
-    docker run -it --rm paddledev/paddle:cpu-latest /bin/bash
-
-or, we can run it as a daemon container
-
-.. code-block:: bash
-
-    docker run -d -p 2202:22 paddledev/paddle:cpu-latest
-
-and SSH to this container using password :code:`root`:
-
-.. code-block:: bash
-
-    ssh -p 2202 root@localhost
-
-An advantage of using SSH is that we can connect to PaddlePaddle from
-more than one terminals.  For example, one terminal running vi and
-another one running Python interpreter.  Another advantage is that we
-can run the PaddlePaddle container on a remote server and SSH to it
-from a laptop.
-
-
-Above methods work with the GPU image too -- just please don't forget
-to install CUDA driver and let Docker knows about it:
-
-.. code-block:: bash
-
-    export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
-    export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
-    docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:gpu-latest
-
-
-Non-AVX Images
---------------
-
-Please be aware that the CPU-only and the GPU images both use the AVX
-instruction set, but old computers produced before 2008 do not support
-AVX.  The following command checks if your Linux computer supports
-AVX:
-
-.. code-block:: bash
-
-   if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi
-
-
-If it doesn't, we will need to build non-AVX images manually from
-source code:
-
-.. code-block:: bash
-
-   cd ~
-   git clone https://github.com/PaddlePaddle/Paddle.git
-   cd Paddle
-   docker build --build-arg WITH_AVX=OFF -t paddle:cpu-noavx -f paddle/scripts/docker/Dockerfile .
-   docker build --build-arg WITH_AVX=OFF -t paddle:gpu-noavx -f paddle/scripts/docker/Dockerfile.gpu .
-
 
 Documentation
 -------------
@@ -194,7 +191,7 @@ container:
 
 .. code-block:: bash
 
-   docker run -d --name paddle-cpu-doc paddle:cpu
+   docker run -d --name paddle-cpu-doc paddle:0.10.0rc1-cpu
    docker run -d --volumes-from paddle-cpu-doc -p 8088:80 nginx
 
 
diff --git a/doc/howto/usage/cmd_parameter/arguments_cn.md b/doc/howto/usage/cmd_parameter/arguments_cn.md
index 2e2a2fcc54..f7aa525054 100644
--- a/doc/howto/usage/cmd_parameter/arguments_cn.md
+++ b/doc/howto/usage/cmd_parameter/arguments_cn.md
@@ -228,16 +228,6 @@
 <td class="left"></td><td class="left"></td><td class="left">√</td><td class="left">√</td>
 </tr>
 
-<tr>
-<td class="left" rowspan = "2">度量学习(metric learning)</td><td class="left">external</td>
-<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
-</tr>
-
-<tr>
-<td class="left">data_server_port</td>
-<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left">√</td>
-</tr>
-
 <tr>
 <td class="left" rowspan = "16">参数服务器(PServer)</td><td class="left">start_pserver</td>
 <td class="left"></td><td class="left">√</td><td class="left"></td><td class="left">√</td>
diff --git a/doc/howto/usage/cmd_parameter/arguments_en.md b/doc/howto/usage/cmd_parameter/arguments_en.md
index e5546f0ddc..d1963067bd 100644
--- a/doc/howto/usage/cmd_parameter/arguments_en.md
+++ b/doc/howto/usage/cmd_parameter/arguments_en.md
@@ -228,16 +228,6 @@ It looks like there are a lot of arguments. However, most of them are for develo
 <td class="left"></td><td class="left"></td><td class="left">√</td><td class="left">√</td>
 </tr>
 
-<tr>
-<td class="left" rowspan = "2">metric learning</td><td class="left">external</td>
-<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
-</tr>
-
-<tr>
-<td class="left">data_server_port</td>
-<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left">√</td>
-</tr>
-
 <tr>
 <td class="left" rowspan = "16">PServer</td><td class="left">start_pserver</td>
 <td class="left"></td><td class="left">√</td><td class="left"></td><td class="left">√</td>
diff --git a/doc/howto/usage/cmd_parameter/detail_introduction_cn.md b/doc/howto/usage/cmd_parameter/detail_introduction_cn.md
index 3b573a324d..b4625ba68c 100644
--- a/doc/howto/usage/cmd_parameter/detail_introduction_cn.md
+++ b/doc/howto/usage/cmd_parameter/detail_introduction_cn.md
@@ -180,15 +180,6 @@
   - 用户可以自定义beam search的方法,编译成动态库,供PaddlePaddle加载。 该参数用于指定动态库路径.
   - 类型: string (默认: "", null).
 
-## 度量学习(Metric Learning)
-* `--external`
-   - 指示是否使用外部机器进行度量学习.
-   - 类型: bool (默认: 0).
-
-* `--data_server_port`
-  - 数据服务器(data server)的监听端口,主要用在度量学习中.
-  - 类型: int32 (默认: 21134).
-
 ## 数据支持(DataProvider)
 
 * `--memory_threshold_on_load_data`
diff --git a/doc/howto/usage/cmd_parameter/detail_introduction_en.md b/doc/howto/usage/cmd_parameter/detail_introduction_en.md
index 33b7ec0d51..b681ebc81a 100644
--- a/doc/howto/usage/cmd_parameter/detail_introduction_en.md
+++ b/doc/howto/usage/cmd_parameter/detail_introduction_en.md
@@ -184,15 +184,6 @@
   - Specify shared dynamic library. It can be defined out of paddle by user.
   - type: string (default: "", null).
 
-## Metric Learning
-* `--external`
-   - Whether to use external machine for metric learning.
-   - type: bool (default: 0).
-
-* `--data_server_port`
-  - Listening port for dserver (data server), dserver is mainly used in metric learning.
-  - type: int32 (default: 21134).
-
 ## DataProvider
 
 * `--memory_threshold_on_load_data`
diff --git a/doc/howto/usage/k8s/k8s_distributed_cn.md b/doc/howto/usage/k8s/k8s_distributed_cn.md
index 2a7a6c8c17..3121b3f59d 100644
--- a/doc/howto/usage/k8s/k8s_distributed_cn.md
+++ b/doc/howto/usage/k8s/k8s_distributed_cn.md
@@ -213,7 +213,7 @@ I1116 09:10:17.123440    50 Util.cpp:130] Calling runInitFunctions
 I1116 09:10:17.123764    50 Util.cpp:143] Call runInitFunctions done.
 [WARNING 2016-11-16 09:10:17,227 default_decorators.py:40] please use keyword arguments in paddle config.
 [INFO 2016-11-16 09:10:17,239 networks.py:1282] The input order is [movie_id, title, genres, user_id, gender, age, occupation, rating]
-[INFO 2016-11-16 09:10:17,239 networks.py:1289] The output order is [__regression_cost_0__]
+[INFO 2016-11-16 09:10:17,239 networks.py:1289] The output order is [__mse_cost_0__]
 I1116 09:10:17.392917    50 Trainer.cpp:170] trainer mode: Normal
 I1116 09:10:17.613910    50 PyDataProvider2.cpp:257] loading dataprovider dataprovider::process
 I1116 09:10:17.680917    50 PyDataProvider2.cpp:257] loading dataprovider dataprovider::process
diff --git a/doc_theme/static/css/override.css b/doc_theme/static/css/override.css
index 438a87848a..09ecff688b 100644
--- a/doc_theme/static/css/override.css
+++ b/doc_theme/static/css/override.css
@@ -1,3 +1,6 @@
+* {
+    font-family:"Roboto","Lato","proxima-nova","Helvetica Neue",Arial,sans-serif;
+}
 body {
     padding-top: 80px;
     background-image: none !important;
diff --git a/paddle/gserver/gradientmachines/MultiGradientMachine.cpp b/paddle/gserver/gradientmachines/MultiGradientMachine.cpp
index 4654d02064..6ae60102b3 100644
--- a/paddle/gserver/gradientmachines/MultiGradientMachine.cpp
+++ b/paddle/gserver/gradientmachines/MultiGradientMachine.cpp
@@ -24,9 +24,6 @@ limitations under the License. */
 DEFINE_bool(allow_only_one_model_on_one_gpu,
             true,
             "If true, do not allow multiple models on one GPU device");
-#ifdef PADDLE_METRIC_LEARNING
-DECLARE_bool(external);
-#endif
 
 namespace paddle {
 
@@ -45,11 +42,7 @@ MultiGradientMachine::MultiGradientMachine(const ModelConfig& config,
       trainerBarrier_(FLAGS_trainer_count),
       allBarrier_(FLAGS_trainer_count + 1),
       inArgsCopied_(false) {
-#ifdef PADDLE_METRIC_LEARNING
-  isPassGrad_ = FLAGS_external;
-#else
   isPassGrad_ = false;
-#endif
   numThreads_ = FLAGS_trainer_count;
   if (useGpu) {
     //! TODO(yuyang18): When useGpu=false && paddle is not compiled with gpu,
diff --git a/paddle/gserver/layers/CRFDecodingLayer.cpp b/paddle/gserver/layers/CRFDecodingLayer.cpp
index fdb46aba68..191176ce98 100644
--- a/paddle/gserver/layers/CRFDecodingLayer.cpp
+++ b/paddle/gserver/layers/CRFDecodingLayer.cpp
@@ -24,7 +24,7 @@ bool CRFDecodingLayer::init(const LayerMap& layerMap,
     return false;
   }
   crf_.reset(new LinearChainCRF(
-      numClasses_, parameter_->getBuf(PARAMETER_VALUE)->getData(), nullptr));
+      numClasses_, parameter_->getBuf(PARAMETER_VALUE)->getData()));
   return true;
 }
 
diff --git a/paddle/gserver/layers/CRFLayer.cpp b/paddle/gserver/layers/CRFLayer.cpp
index 02b7aaf17e..0b54442009 100644
--- a/paddle/gserver/layers/CRFLayer.cpp
+++ b/paddle/gserver/layers/CRFLayer.cpp
@@ -42,6 +42,7 @@ bool CRFLayer::init(const LayerMap& layerMap,
   CHECK_EQ(parameters_[0]->getSize(), numClasses_ * (numClasses_ + 2));
 
   parameter_ = parameters_[0];
+  weight_.reset(new Weight(numClasses_ + 2, numClasses_, parameter_));
 
   // We don't need sequenceStartPositions because each sample of output_ is
   // for the cost of one sequence.
@@ -69,11 +70,7 @@ void CRFLayer::forward(PassType passType) {
 
   for (size_t i = 0; i < numSequences; ++i) {
     if (i >= crfs_.size()) {
-      crfs_.emplace_back(numClasses_,
-                         parameter_->getBuf(PARAMETER_VALUE)->getData(),
-                         parameter_->getBuf(PARAMETER_GRADIENT)
-                             ? parameter_->getBuf(PARAMETER_GRADIENT)->getData()
-                             : nullptr);
+      crfs_.emplace_back(numClasses_, weight_->getW()->getData());
     }
     output_.value->getData()[i] =
         crfs_[i].forward(output.value->getData() + numClasses_ * starts[i],
@@ -93,22 +90,25 @@ void CRFLayer::backward(const UpdateCallback& callback) {
   const int* starts = label.sequenceStartPositions->getData(false);
   int numSequences = label.sequenceStartPositions->getSize() - 1;
 
+  bool needWGrad = weight_->getWGrad() ? true : false;
   for (int i = 0; i < numSequences; ++i) {
     crfs_[i].backward(output.value->getData() + numClasses_ * starts[i],
-                      output.grad->getData() + numClasses_ * starts[i],
                       label.ids->getData() + starts[i],
-                      starts[i + 1] - starts[i]);
-    if (weightLayer_) {
-      real weight = getInputValue(*weightLayer_)->getElement(i, 0);
-      MatrixPtr grad = output.grad->subRowMatrix(starts[i], starts[i + 1]);
-      grad->mulScalar(weight);
+                      starts[i + 1] - starts[i],
+                      needWGrad);
+    real instanceWeight = weightLayer_
+                              ? getInputValue(*weightLayer_)->getElement(i, 0)
+                              : real(1.0f);
+    instanceWeight *= coeff_;
+
+    MatrixPtr grad = output.grad->subRowMatrix(starts[i], starts[i + 1]);
+    grad->add(*crfs_[i].getXGrad(), real(1.0f), instanceWeight);
+    if (needWGrad) {
+      weight_->getWGrad()->add(
+          *crfs_[i].getWGrad(), real(1.0f), instanceWeight);
     }
   }
 
-  if (coeff_ != real(1.0f)) {
-    output.grad->mulScalar(coeff_);
-  }
-
   parameter_->incUpdate(callback);
 }
 
diff --git a/paddle/gserver/layers/CRFLayer.h b/paddle/gserver/layers/CRFLayer.h
index de36a85083..00ec13cede 100644
--- a/paddle/gserver/layers/CRFLayer.h
+++ b/paddle/gserver/layers/CRFLayer.h
@@ -38,8 +38,9 @@ protected:
   size_t numClasses_;
   ParameterPtr parameter_;
   std::vector<LinearChainCRF> crfs_;
-  LayerPtr weightLayer_;  // weight for each sequence
-  real coeff_;            // weight for the layer
+  LayerPtr weightLayer_;            // weight for each sequence
+  std::unique_ptr<Weight> weight_;  // parameters
+  real coeff_;                      // weight for the layer
 };
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/Layer.cpp b/paddle/gserver/layers/Layer.cpp
index f76d41ad3e..125aaf947f 100644
--- a/paddle/gserver/layers/Layer.cpp
+++ b/paddle/gserver/layers/Layer.cpp
@@ -381,8 +381,7 @@ void Layer::backwardActivation() {
 void Layer::forwardDropOut() {
   auto& outV = getOutputValue();
 
-  if (passType_ == PASS_TRAIN || passType_ == PASS_METRIC_TRAIN ||
-      passType_ == PASS_METRIC_TRAIN_WITH_NOERROR) {
+  if (passType_ == PASS_TRAIN) {
     // new dropOutMask_ if dropOutMask_ is null ptr
     Matrix::resizeOrCreate(dropOutMask_,
                            outV->getHeight(),
diff --git a/paddle/gserver/layers/LinearChainCRF.cpp b/paddle/gserver/layers/LinearChainCRF.cpp
index b7f748f3bb..dc3dc15679 100644
--- a/paddle/gserver/layers/LinearChainCRF.cpp
+++ b/paddle/gserver/layers/LinearChainCRF.cpp
@@ -17,18 +17,12 @@ limitations under the License. */
 
 namespace paddle {
 
-LinearChainCRF::LinearChainCRF(int numClasses, real* para, real* grad)
+LinearChainCRF::LinearChainCRF(int numClasses, real* para)
     : numClasses_(numClasses) {
   a_ = Matrix::create(para, 1, numClasses_);
   b_ = Matrix::create(para + numClasses_, 1, numClasses_);
   w_ = Matrix::create(para + 2 * numClasses_, numClasses_, numClasses_);
 
-  if (grad) {
-    da_ = Matrix::create(grad, 1, numClasses_);
-    db_ = Matrix::create(grad + numClasses_, 1, numClasses_);
-    dw_ = Matrix::create(grad + 2 * numClasses_, numClasses_, numClasses_);
-  }
-
   ones_ = Matrix::create(1, numClasses_);
   ones_->one();
 
@@ -107,19 +101,24 @@ real LinearChainCRF::forward(real* x, int* s, int length) {
   return -ll;
 }
 
-void LinearChainCRF::backward(real* x, real* dx, int* s, int length) {
+void LinearChainCRF::backward(real* x, int* s, int length, bool needWGrad) {
   MatrixPtr matX = Matrix::create(x, length, numClasses_);
-  MatrixPtr matDX = Matrix::create(dx, length, numClasses_);
-  MatrixPtr matGrad = Matrix::create(length, numClasses_);
+  Matrix::resizeOrCreate(matGrad_, length, numClasses_);
   Matrix::resizeOrCreate(beta_, length, numClasses_);
   real* b = b_->getData();
-  real* dw = dw_ ? dw_->getData() : nullptr;
+  if (needWGrad) {
+    Matrix::resizeOrCreate(matWGrad_, numClasses_ + 2, numClasses_);
+    matWGrad_->zeroMem();
+    da_ = matWGrad_->subRowMatrix(0, 1);
+    db_ = matWGrad_->subRowMatrix(1, 2);
+    dw_ = matWGrad_->subRowMatrix(2, numClasses_ + 2);
+  }
 
   real* alpha = alpha_->getData();
   real* beta = beta_->getData();
   real* expW = expW_->getData();
   real* expX = expX_->getData();
-  real* grad = matGrad->getData();
+  real* grad = matGrad_->getData();
 
   for (int i = 0; i < numClasses_; ++i) {
     beta[(length - 1) * numClasses_ + i] = exp(b[i]);
@@ -140,39 +139,38 @@ void LinearChainCRF::backward(real* x, real* dx, int* s, int length) {
     normalizeL1(beta + k * numClasses_, numClasses_);
   }
 
-  matGrad->dotMul(*alpha_, *beta_);
-  matGrad->rowNormalizeL1(*matGrad);
+  matGrad_->dotMul(*alpha_, *beta_);
+  matGrad_->rowNormalizeL1(*matGrad_);
   for (int k = 0; k < length; ++k) {
     grad[k * numClasses_ + s[k]] -= (real)1;
   }
-  matDX->add(*matGrad);
-  if (da_) {
-    da_->add(*matGrad->subMatrix(/* startRow= */ 0, /* numRows= */ 1));
-  }
-  if (db_) {
-    db_->add(*matGrad->subMatrix(/* startRow= */ length - 1, 1));
-  }
 
-  beta_->dotMul(*beta_, *expX_);
-  beta_->rowNormalizeL1(*beta_);
+  if (needWGrad) {
+    da_->add(*matGrad_->subMatrix(/* startRow= */ 0, /* numRows= */ 1));
+    db_->add(*matGrad_->subMatrix(/* startRow= */ length - 1, 1));
 
-  for (int k = 1; dw && k < length; ++k) {
-    real sum = 0;
-    for (int i = 0; i < numClasses_; ++i) {
-      for (int j = 0; j < numClasses_; ++j) {
-        sum += expW[i * numClasses_ + j] * alpha[(k - 1) * numClasses_ + i] *
-               beta[k * numClasses_ + j];
+    beta_->dotMul(*beta_, *expX_);
+    beta_->rowNormalizeL1(*beta_);
+
+    real* dw = dw_->getData();
+    for (int k = 1; k < length; ++k) {
+      real sum = 0;
+      for (int i = 0; i < numClasses_; ++i) {
+        for (int j = 0; j < numClasses_; ++j) {
+          sum += expW[i * numClasses_ + j] * alpha[(k - 1) * numClasses_ + i] *
+                 beta[k * numClasses_ + j];
+        }
       }
-    }
-    sum = 1 / sum;
-    for (int i = 0; i < numClasses_; ++i) {
-      for (int j = 0; j < numClasses_; ++j) {
-        dw[i * numClasses_ + j] += sum * expW[i * numClasses_ + j] *
-                                   alpha[(k - 1) * numClasses_ + i] *
-                                   beta[k * numClasses_ + j];
+      sum = 1 / sum;
+      for (int i = 0; i < numClasses_; ++i) {
+        for (int j = 0; j < numClasses_; ++j) {
+          dw[i * numClasses_ + j] += sum * expW[i * numClasses_ + j] *
+                                     alpha[(k - 1) * numClasses_ + i] *
+                                     beta[k * numClasses_ + j];
+        }
       }
+      dw[s[k - 1] * numClasses_ + s[k]] -= (real)1;
     }
-    dw[s[k - 1] * numClasses_ + s[k]] -= (real)1;
   }
 }
 
diff --git a/paddle/gserver/layers/LinearChainCRF.h b/paddle/gserver/layers/LinearChainCRF.h
index a905bf803d..8daf1e14a6 100644
--- a/paddle/gserver/layers/LinearChainCRF.h
+++ b/paddle/gserver/layers/LinearChainCRF.h
@@ -21,7 +21,7 @@ namespace paddle {
 class LinearChainCRF {
 public:
   /**
-   * The size of para and grad must be \f$(numClasses + 2) * numClasses\f$.
+   * The size of para must be \f$(numClasses + 2) * numClasses\f$.
    * The first numClasses values of para are for starting weights (\f$a\f$).
    * The next numClasses values of para are for ending weights (\f$b\f$),
    * The remaning values are for transition weights (\f$w\f$).
@@ -34,7 +34,7 @@ public:
    * all possible
    * sequences is \f$1\f$, and \f$x\f$ is the input feature to the CRF.
    */
-  LinearChainCRF(int numClasses, real* para, real* grad);
+  LinearChainCRF(int numClasses, real* para);
 
   /**
    * Calculate the negative log likelihood of s given x.
@@ -45,29 +45,45 @@ public:
 
   /**
    * Calculate the gradient with respect to x, a, b, and w.
-   * The gradient of x will be stored in dx.
    * backward() can only be called after a corresponding call to forward() with
    * the same x, s and length.
-   * @note The gradient is added to dx and grad (provided at constructor).
+   * The gradient with respect to a, b, and w will not be calculated if
+   * needWGrad is false.
+   * @note Please call getWGrad() and getXGrad() to get the gradient with
+   * respect to (a, b, w) and x respectively.
    */
-  void backward(real* x, real* dx, int* s, int length);
+  void backward(real* x, int* s, int length, bool needWGrad);
 
   /**
    * Find the most probable sequence given x. The result will be stored in s.
    */
   void decode(real* x, int* s, int length);
 
+  /*
+   * Return the gradient with respect to (a, b, w). It can only be called after
+   * a corresponding call to backward().
+   */
+  MatrixPtr getWGrad() { return matWGrad_; }
+
+  /*
+   * Return the gradient with respect to x. It can only be called after a
+   * corresponding call to backward().
+   */
+  MatrixPtr getXGrad() { return matGrad_; }
+
 protected:
   int numClasses_;
   MatrixPtr a_;
   MatrixPtr b_;
   MatrixPtr w_;
+  MatrixPtr matWGrad_;
   MatrixPtr da_;
   MatrixPtr db_;
   MatrixPtr dw_;
   MatrixPtr ones_;
 
   MatrixPtr expX_;
+  MatrixPtr matGrad_;
   MatrixPtr alpha_;
   MatrixPtr beta_;
   MatrixPtr maxX_;
diff --git a/paddle/gserver/tests/CMakeLists.txt b/paddle/gserver/tests/CMakeLists.txt
index 0caa5e1e11..3c4128b5b8 100644
--- a/paddle/gserver/tests/CMakeLists.txt
+++ b/paddle/gserver/tests/CMakeLists.txt
@@ -18,6 +18,14 @@ add_unittest_without_exec(test_LayerGrad
 add_test(NAME test_LayerGrad
     COMMAND test_LayerGrad)
 
+################ test_CRFLayerGrad ####################
+add_unittest_without_exec(test_CRFLayerGrad
+    test_CRFLayerGrad.cpp
+    LayerGradUtil.cpp)
+add_test(NAME test_CRFLayerGrad
+    COMMAND test_CRFLayerGrad)
+
+
 add_unittest_without_exec(test_ActivationGrad
     test_ActivationGrad.cpp
     LayerGradUtil.cpp)
diff --git a/paddle/gserver/tests/test_CRFLayerGrad.cpp b/paddle/gserver/tests/test_CRFLayerGrad.cpp
new file mode 100644
index 0000000000..df14449291
--- /dev/null
+++ b/paddle/gserver/tests/test_CRFLayerGrad.cpp
@@ -0,0 +1,174 @@
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "ModelConfig.pb.h"
+#include "paddle/gserver/layers/DataLayer.h"
+#include "paddle/gserver/layers/LinearChainCRF.h"
+#include "paddle/trainer/Trainer.h"
+
+#include "LayerGradUtil.h"
+#include "paddle/testing/TestUtil.h"
+
+using namespace paddle;  // NOLINT
+
+DECLARE_int32(gpu_id);
+DECLARE_bool(thread_local_rand_use_global_seed);
+
+static inline bool getNextSequence(std::vector<int>& seq, int numClasses) {
+  for (auto& v : seq) {
+    if (++v < numClasses) {
+      return true;
+    }
+    v = 0;
+  }
+  return false;
+}
+
+// log(exp(x) + exp(y))
+static inline real logSum(real x, real y) {
+  real maxValue = std::max(x, y);
+  if (std::isinf(maxValue)) {
+    return -std::numeric_limits<real>::infinity();
+  } else {
+    return maxValue + log(exp(x - maxValue) + exp(y - maxValue));
+  }
+}
+
+static inline std::vector<int> genRandLabels(int numClasses, int length) {
+  std::vector<int> labels(length);
+  for (int i = 0; i < length; ++i) {
+    labels[i] = rand() % numClasses;  // NOLINT
+  }
+  return labels;
+}
+
+TEST(CRFLayer, cost) {
+  const int numClasses = 4;
+  CpuVector para(numClasses * (numClasses + 2));
+  real* a = para.getData();
+  real* b = para.getData() + numClasses;
+  real* w = para.getData() + 2 * numClasses;
+  LinearChainCRF crf(4, para.getData());
+  for (int length : {1, 2, 3, 10}) {
+    for (int tries = 0; tries < 10; ++tries) {
+      CpuMatrix x(length, numClasses);
+      x.randomizeUniform();
+      para.randnorm(0, 2);
+
+      std::vector<int> goldenLabels = genRandLabels(numClasses, length);
+
+      real cost = crf.forward(x.getData(), goldenLabels.data(), length);
+
+      real logZ = -std::numeric_limits<real>::infinity();
+      real logNominator = -std::numeric_limits<real>::infinity();
+      std::vector<int> testResult(length, 0);
+      do {
+        real score = a[testResult.front()];
+        score += x.getElement(0, testResult.front());
+        for (int k = 1; k < length; ++k) {
+          score += x.getElement(k, testResult[k]) +
+                   w[numClasses * testResult[k - 1] + testResult[k]];
+        }
+        score += b[testResult.back()];
+        logZ = logSum(logZ, score);
+
+        if (goldenLabels == testResult) {
+          logNominator = score;
+        }
+      } while (getNextSequence(testResult, numClasses));
+
+      real trueCost = -logNominator + logZ;
+
+      real diff = fabs(trueCost - cost);
+      diff /= fabs(cost) < fabs(trueCost) ? fabs(cost) : fabs(trueCost);
+      VLOG(1) << "cost=" << cost << " trueCost=" << trueCost << " diff=" << diff
+              << std::endl;
+      if (typeid(real) == typeid(double)) {  // NOLINT
+        EXPECT_LE(diff, 1e-10);
+      } else {
+        EXPECT_LE(diff, 5e-3);
+      }
+    }
+  }
+}
+
+inline real epsilon() { return typeid(real) == typeid(double) ? 1e-10 : 0.06; }
+
+TestConfig initTestConfig(size_t numClasses, bool withWeight) {
+  TestConfig config;
+  config.layerConfig.set_type("crf");
+  config.layerConfig.set_size(numClasses);
+  config.biasSize = 0;
+
+  config.inputDefs.push_back({INPUT_SEQUENCE_DATA,
+                              "layer_0",
+                              numClasses,
+                              numClasses * (numClasses + 2)});
+  config.layerConfig.add_inputs();
+  config.inputDefs.push_back(
+      {INPUT_SEQUENCE_LABEL, "layer_label", numClasses, 0});
+  config.layerConfig.add_inputs();
+
+  if (withWeight) {
+    config.inputDefs.push_back({INPUT_DENSE_DIM_DATA, "layer_weight", 1, 0});
+    config.layerConfig.add_inputs();
+  }
+
+  return config;
+}
+
+TEST(Layer, CRFLayer) {
+  size_t numClasses = 10;
+  for (int tries = 0; tries < 5; ++tries) {
+    TestConfig config = initTestConfig(numClasses, /* withWeight= */ false);
+    for (int length : {1, 3, 100}) {
+      // Not support GPU now
+      testLayerGrad(config,
+                    "crf",
+                    length,
+                    /* trans= */ false,
+                    /* useGpu= */ false,
+                    /* useWeight= */ false,
+                    epsilon());
+    }
+  }
+}
+
+TEST(Layer, CRFLayerUseWeight) {
+  size_t numClasses = 10;
+  for (int tries = 0; tries < 5; ++tries) {
+    TestConfig config = initTestConfig(numClasses, /* withWeight= */ true);
+    for (int length : {1, 3, 100}) {
+      // Not support GPU now
+      testLayerGrad(config,
+                    "crf",
+                    length,
+                    /* trans= */ false,
+                    /* useGpu= */ false,
+                    /* useWeight= */ false,
+                    epsilon());
+    }
+  }
+}
+
+int main(int argc, char** argv) {
+  initMain(argc, argv);
+  hl_start();
+  hl_init(FLAGS_gpu_id);
+  FLAGS_thread_local_rand_use_global_seed = true;
+  srand(1);
+  testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp
index 14d9db5247..ceb69359c9 100644
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -276,27 +276,6 @@ TEST(Layer, AddtoLayer) {
   }
 }
 
-TEST(Layer, CRFLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("crf");
-  config.layerConfig.set_size(10);
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_SEQUENCE_DATA, "layer_0", 10, 120});
-  config.inputDefs.push_back({INPUT_SEQUENCE_LABEL, "layer_1", 10, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  // Not support GPU now
-  testLayerGrad(config,
-                "crf",
-                100,
-                /* trans */ false,
-                /* useGpu */ false,
-                false /*useWeight*/,
-                0.03 /*epsilon*/);
-}
-
 TEST(Layer, CTCLayer) {
   TestConfig config;
   config.layerConfig.set_type("ctc");
diff --git a/paddle/gserver/tests/test_LinearChainCRF.cpp b/paddle/gserver/tests/test_LinearChainCRF.cpp
index f046cb0b28..b37277054c 100644
--- a/paddle/gserver/tests/test_LinearChainCRF.cpp
+++ b/paddle/gserver/tests/test_LinearChainCRF.cpp
@@ -36,7 +36,7 @@ TEST(LinearChainCRF, decoding) {
   real* a = para.getData();
   real* b = para.getData() + numClasses;
   real* w = para.getData() + 2 * numClasses;
-  LinearChainCRF crf(4, para.getData(), nullptr);
+  LinearChainCRF crf(4, para.getData());
   for (int length : {1, 2, 3, 10}) {
     for (int tries = 0; tries < 10; ++tries) {
       CpuMatrix x(length, numClasses);
diff --git a/paddle/pserver/BaseClient.h b/paddle/pserver/BaseClient.h
index 11d7a147bf..667bc451d1 100644
--- a/paddle/pserver/BaseClient.h
+++ b/paddle/pserver/BaseClient.h
@@ -30,9 +30,6 @@ namespace paddle {
  * the first solution arms with sendThreads_/recvThreads_ and sendJobQueue_/
  * recvJobQueue_. the second solution use some shared thread pool to manage
  * connections.
- * In addition to pserver, metric learning also uses network to exchange
- * features within multi-machines, so this class just abstracts some basic
- * threads and queue buffer creation for them
  */
 class BaseClient {
 protected:
diff --git a/paddle/pserver/ParameterServer2.cpp b/paddle/pserver/ParameterServer2.cpp
index 856fa0ad1a..877cbb86ec 100644
--- a/paddle/pserver/ParameterServer2.cpp
+++ b/paddle/pserver/ParameterServer2.cpp
@@ -367,11 +367,8 @@ void ParameterServer2::addGradient(const SendParameterRequest& request,
                                    std::vector<Buffer>* outputBuffers) {
   VLOG(1) << "pserver: addGradient";
 
-/// forwardbackward delta from all trainers
-/// indicate the fluctuation caused by forwardbackward.
-#ifndef PADDLE_METRIC_LEARNING
-  // @TODO(yanfei):
-  // add support tuning forwardbackward balance for metric learning
+  // forwardbackward delta from all trainers
+  // indicate the fluctuation caused by forwardbackward.
   if (!numPassFinishClients_) {
     REGISTER_BARRIER_DELTA_SERVER_SET(
         *statSet_,
@@ -381,7 +378,6 @@ void ParameterServer2::addGradient(const SendParameterRequest& request,
         request.forwardbackward_time(),
         isSparseServer_ ? "_sparseUpdater" : "_denseUpdater");
   }
-#endif
 
   {
     /// approximately pure network overhead
diff --git a/paddle/py_paddle/dataprovider_converter.py b/paddle/py_paddle/dataprovider_converter.py
index c009b05cde..6d6a406cf6 100644
--- a/paddle/py_paddle/dataprovider_converter.py
+++ b/paddle/py_paddle/dataprovider_converter.py
@@ -16,11 +16,25 @@ import paddle.trainer.PyDataProvider2 as dp2
 import collections
 import swig_paddle
 import numpy
+import itertools
 
 __all__ = ['DataProviderConverter']
 
 
 class IScanner(object):
+    """
+    The scanner will scan Python object two passes, then convert it to Paddle's
+    argument.
+
+    In the first pass, `pre_scan` will be invoked by every data instance, and
+    then invoke `finish_pre_scan` to prepare the arguments. The second pass
+    does the same thing except the functions change to `scan`, `finish_scan`.
+
+    During the first pass, a scanner may count the shape of the input matrix
+    and allocate memory for this argument. Then it fills the data into this
+    argument in the second pass.
+    """
+
     def __init__(self, input_type, pos):
         self.input_type = input_type
         if not isinstance(self.input_type, dp2.InputType):
@@ -36,10 +50,40 @@ class IScanner(object):
         self.data_in_gpu = swig_paddle.isUsingGpu(
         ) and swig_paddle.getTrainerCount() == 1
 
+    def pre_scan(self, dat):
+        """
+        First pass scan method. During this method, the scanner could count the
+        data number, and get the total memory size this batch would use.
+
+        :param dat: The python object.
+        """
+        pass
+
+    def finish_pre_scan(self, argument):
+        """
+        Finish first scan pass. Allocate the memory.
+
+        :param argument: Output arguments object.
+        :type argument: swig_paddle.Arguments
+        :return:
+        """
+        pass
+
     def scan(self, dat):
+        """
+        Second pass scan method. Copy the data to arguments.
+
+        :param dat: The python object.
+        """
         pass
 
     def finish_scan(self, argument):
+        """
+        Finish second pass. Finalize the resources, etc.
+
+        :param argument: Output arguments object.
+        :type argument: swig_paddle.Arguments
+        """
         pass
 
 
@@ -51,12 +95,19 @@ class DenseScanner(IScanner):
     def __init__(self, input_type, pos):
         IScanner.__init__(self, input_type, pos)
         self.__mat__ = None
+        self.__height__ = 0
+
+    def pre_scan(self, dat):
+        self.__height__ += 1
+
+    def finish_pre_scan(self, argument):
+        self.__mat__ = numpy.ndarray(
+            shape=(self.__height__, self.input_type.dim), dtype=numpy.float32)
+        self.__height__ = 0
 
     def scan(self, dat):
-        if self.__mat__ is None:
-            self.__mat__ = numpy.array([dat], dtype='float32')
-        else:
-            self.__mat__ = numpy.append(self.__mat__, [dat], axis=0)
+        self.__mat__[self.__height__] = dat
+        self.__height__ += 1
 
     def finish_scan(self, argument):
         assert isinstance(argument, swig_paddle.Arguments)
@@ -163,7 +214,14 @@ class DataProviderConverter(object):
         ]
 
         for each_sample in dat:
-            for each_step, scanner in zip(each_sample, scanners):
+            for each_step, scanner in itertools.izip(each_sample, scanners):
+                scanner.pre_scan(each_step)
+
+        for scanner in scanners:
+            scanner.finish_pre_scan(argument)
+
+        for each_sample in dat:
+            for each_step, scanner in itertools.izip(each_sample, scanners):
                 scanner.scan(each_step)
 
         for scanner in scanners:
diff --git a/paddle/scripts/deb/build_scripts/.gitignore b/paddle/scripts/deb/build_scripts/.gitignore
deleted file mode 100644
index 1521c8b765..0000000000
--- a/paddle/scripts/deb/build_scripts/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-dist
diff --git a/paddle/scripts/deb/build_scripts/Dockerfile b/paddle/scripts/deb/build_scripts/Dockerfile
deleted file mode 100644
index db365a65b7..0000000000
--- a/paddle/scripts/deb/build_scripts/Dockerfile
+++ /dev/null
@@ -1,5 +0,0 @@
-FROM paddledev/paddle:gpu-latest
-MAINTAINER PaddlePaddle Dev Team <paddle-dev@baidu.com>
-COPY build.sh /root/
-CMD cd /root/ && bash build.sh
-
diff --git a/paddle/scripts/deb/build_scripts/build.sh b/paddle/scripts/deb/build_scripts/build.sh
deleted file mode 100755
index d13dea5148..0000000000
--- a/paddle/scripts/deb/build_scripts/build.sh
+++ /dev/null
@@ -1,35 +0,0 @@
-#!/bin/bash
-set -e
-apt-get update
-apt-get install -y dh-make
-cd ~
-mkdir -p ~/dist/gpu
-mkdir -p ~/dist/cpu
-mkdir -p ~/dist/cpu-noavx
-mkdir -p ~/dist/gpu-noavx
-cd paddle
-mkdir build
-cd build
-cmake .. -DWITH_GPU=OFF -DWITH_SWIG_PY=ON -DWITH_AVX=ON
-make -j `nproc`
-cpack -D CPACK_GENERATOR='DEB' ..
-mv *.deb ~/dist/cpu
-
-rm -rf *
-cmake .. -DWITH_GPU=ON -DWITH_SWIG_PY=ON -DWITH_AVX=ON -DCUDNN_ROOT=/usr/
-make -j `nproc`
-cpack -D CPACK_GENERATOR='DEB' ..
-mv *.deb ~/dist/gpu
-
-
-rm -rf *
-cmake .. -DWITH_GPU=OFF -DWITH_SWIG_PY=ON -DWITH_AVX=OFF
-make -j `nproc`
-cpack -D CPACK_GENERATOR='DEB' ..
-mv *.deb ~/dist/cpu-noavx
-
-rm -rf *
-cmake .. -DWITH_GPU=ON -DWITH_SWIG_PY=ON -DWITH_AVX=OFF -DCUDNN_ROOT=/usr/
-make -j `nproc`
-cpack -D CPACK_GENERATOR='DEB' ..
-mv *.deb ~/dist/gpu-noavx
diff --git a/paddle/scripts/deb/build_scripts/build_deb.sh b/paddle/scripts/deb/build_scripts/build_deb.sh
deleted file mode 100755
index c38c6299f8..0000000000
--- a/paddle/scripts/deb/build_scripts/build_deb.sh
+++ /dev/null
@@ -1,8 +0,0 @@
-#!/bin/bash
-set -e
-docker build -t build_paddle_deb .
-rm -rf dist
-mkdir -p dist
-docker run -v$PWD/dist:/root/dist -v $PWD/../../../..:/root/paddle --name tmp_build_deb_container build_paddle_deb
-docker rm tmp_build_deb_container
-docker rmi build_paddle_deb
diff --git a/paddle/scripts/deb/postinst b/paddle/scripts/deb/postinst
deleted file mode 100644
index 1d2dd3171a..0000000000
--- a/paddle/scripts/deb/postinst
+++ /dev/null
@@ -1,7 +0,0 @@
-#!/bin/bash
-set -e
-echo "Post install paddle debian package."
-echo "Install some python package used for paddle. You can run "
-echo "  pip install /usr/opt/paddle/share/wheels/*.whl to install them."
-pip install /usr/opt/paddle/share/wheels/*.whl
-
diff --git a/paddle/scripts/docker/Dockerfile b/paddle/scripts/docker/Dockerfile
deleted file mode 100644
index 6435923c89..0000000000
--- a/paddle/scripts/docker/Dockerfile
+++ /dev/null
@@ -1,70 +0,0 @@
-FROM ubuntu:14.04
-MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
-
-ARG DEBIAN_FRONTEND=noninteractive
-ARG UBUNTU_MIRROR
-RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi'
-
-# ENV variables
-ARG BUILD_WOBOQ
-ARG BUILD_AND_INSTALL
-ARG WITH_AVX
-ARG WITH_DOC
-ARG WITH_STYLE_CHECK
-
-ENV BUILD_WOBOQ=${BUILD_WOBOQ:-OFF}
-ENV BUILD_AND_INSTALL=${BUILD_AND_INSTALL:-OFF}
-ENV WITH_GPU=OFF
-ENV WITH_AVX=${WITH_AVX:-ON}
-ENV WITH_DOC=${WITH_DOC:-OFF}
-ENV WITH_STYLE_CHECK=${WITH_STYLE_CHECK:-OFF}
-
-ENV HOME /root
-
-# Add bash enhancements
-COPY ./paddle/scripts/docker/root/ /root/
-
-RUN apt-get update && \
-    apt-get install -y git python-pip python-dev openssh-server bison && \
-    apt-get install -y wget unzip tar xz-utils bzip2 gzip coreutils && \
-    apt-get install -y curl sed grep graphviz libjpeg-dev zlib1g-dev && \
-    apt-get install -y python-numpy python-matplotlib gcc g++ gfortran && \
-    apt-get install -y automake locales clang-format-3.8 && \
-    apt-get clean -y
-
-# git credential to skip password typing
-RUN git config --global credential.helper store
-
-# Fix locales to en_US.UTF-8
-RUN localedef -i en_US -f UTF-8 en_US.UTF-8
-
-RUN pip install --upgrade pip && \
-    pip install -U 'protobuf==3.1.0' && \
-    pip install -U wheel pillow BeautifulSoup && \
-    pip install -U docopt PyYAML sphinx && \
-    pip install -U sphinx-rtd-theme==0.1.9 recommonmark && \
-    pip install -U pre-commit 'requests==2.9.2' jupyter
-
-RUN curl -sSL https://cmake.org/files/v3.4/cmake-3.4.1.tar.gz | tar -xz && \
-    cd cmake-3.4.1 && ./bootstrap && make -j `nproc` && make install && \
-    cd .. && rm -rf cmake-3.4.1
-
-COPY . /paddle/
-RUN cd /paddle/ && git submodule update --init --recursive
-RUN /paddle/paddle/scripts/docker/build.sh
-
-VOLUME ["/usr/share/nginx/html/data", "/usr/share/nginx/html/paddle"]
-
-# Configure OpenSSH server. c.f. https://docs.docker.com/engine/examples/running_ssh_service
-RUN mkdir /var/run/sshd
-RUN echo 'root:root' | chpasswd
-RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config
-RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config
-EXPOSE 22
-
-# Jupyter Notebook: Paddle book
-EXPOSE 8888
-
-COPY ./paddle/scripts/docker/entrypoint /opt/bin/
-
-CMD ["/opt/bin/entrypoint"]
diff --git a/paddle/scripts/docker/README.md b/paddle/scripts/docker/README.md
new file mode 100644
index 0000000000..8c35411fc3
--- /dev/null
+++ b/paddle/scripts/docker/README.md
@@ -0,0 +1,155 @@
+# Building PaddlePaddle
+
+## Goals
+
+We want the building procedure to generate Docker images so that we can run PaddlePaddle applications on Kubernetes clusters.
+
+We want to build .deb packages so that enterprise users can run PaddlePaddle applications without Docker.
+
+We want to minimize the size of generated Docker images and .deb packages so as to reduce the download time.
+
+We want to encapsulate building tools and dependencies in a *development* Docker image so as to ease the tools installation for developers.
+
+Developers use various editors (emacs, vim, Eclipse, Jupyter Notebook), so the development Docker image contains only building tools, not editing tools, and developers are supposed to git clone source code into their development computers and map the code into the development container.
+
+We want the procedure and tools also work with testing, continuous integration, and releasing.
+
+
+## Docker Images
+
+So we need two Docker images for each version of PaddlePaddle:
+
+1. `paddle:<version>-dev`
+
+   This is a development image that contains only the development tools and standardizes the building procedure.  Users include:
+
+   - developers -- no longer need to install development tools on the host, and can build their current work on the host (development computer).
+   - release engineers -- use this to build the official release from certain branch/tag on Github.com.
+   - document writers / Website developers -- Our documents are in the source repo in the form of .md/.rst files and comments in source code.  We need tools to extract the information, typeset, and generate Web pages.
+
+   Of course, developers can install building tools on their development computers.  But different versions of PaddlePaddle might require different set or version of building tools.  Also, it makes collaborative debugging easier if all developers use a unified development environment.
+
+   The development image should include the following tools:
+
+   - gcc/clang
+   - nvcc
+   - Python
+   - sphinx
+   - woboq
+   - sshd
+
+   Many developers work on a remote computer with GPU; they could ssh into the computer and `docker exec` into the development container. However, running `sshd` in the container allows developers to ssh into the container directly.
+
+1. `paddle:<version>`
+
+   This is the production image, generated using the development image. This image might have multiple variants:
+
+   - GPU/AVX   `paddle:<version>-gpu`
+   - GPU/no-AVX  `paddle:<version>-gpu-noavx`
+   - no-GPU/AVX  `paddle:<version>`
+   - no-GPU/no-AVX  `paddle:<version>-noavx`
+
+   We allow users to choose between GPU and no-GPU because the GPU version image is much larger than the no-GPU version.
+
+   We allow users the choice between AVX and no-AVX, because some cloud providers don't provide AVX-enabled VMs.
+
+
+## Development Environment
+
+Here we describe how to use the above two images.  We start by considering our daily development environment.
+
+Developers work on a computer, which is usually a laptop or desktop:
+
+<img src="doc/paddle-development-environment.png" width=500 />
+
+or, they might rely on a more sophisticated box (like with GPUs):
+
+<img src="doc/paddle-development-environment-gpu.png" width=500 />
+
+A principle here is that source code lies on the development computer (host) so that editors like Eclipse can parse the source code to support auto-completion.
+
+
+## Usages
+
+### Build the Development Docker Image
+
+The following commands check out the source code to the host and build the development image `paddle:dev`:
+
+```bash
+git clone https://github.com/PaddlePaddle/Paddle paddle
+cd paddle
+docker build -t paddle:dev .
+```
+
+The `docker build` command assumes that `Dockerfile` is in the root source tree.  Note that in this design, this `Dockerfile` is the only one in our repo.
+
+
+### Build PaddlePaddle from Source Code
+
+Given the development image `paddle:dev`, the following command builds PaddlePaddle from the source tree on the development computer (host):
+
+```bash
+docker run -v $PWD:/paddle -e "GPU=OFF" -e "AVX=ON" -e "TEST=ON" paddle:dev
+```
+
+This command mounts the source directory on the host into `/paddle` in the container, so the default entry point of `paddle:dev`, `build.sh`, could build the source code with possible local changes.  When it writes to `/paddle/build` in the container, it writes to `$PWD/build` on the host indeed.
+
+`build.sh` builds the following:
+
+- PaddlePaddle binaries,
+- `$PWD/build/paddle-<version>.deb` for production installation, and
+- `$PWD/build/Dockerfile`, which builds the production Docker image.
+
+
+### Build the Production Docker Image
+
+The following command builds the production image:
+
+```bash
+docker build -t paddle -f build/Dockerfile .
+```
+
+This production image is minimal -- it includes binary `paddle`, the shared library `libpaddle.so`, and Python runtime.
+
+### Run PaddlePaddle Applications
+
+Again the development happens on the host.  Suppose that we have a simple application program in `a.py`, we can test and run it using the production image:
+
+```bash
+docker run -it -v $PWD:/work paddle /work/a.py
+```
+
+But this works only if all dependencies of `a.py` are in the production image. If this is not the case, we need to build a new Docker image from the production image with more dependencies installed.
+
+### Build and Run PaddlePaddle Applications
+
+We need a Dockerfile in https://github.com/paddlepaddle/book that builds the Docker image `paddlepaddle/book:<version>`, based on the PaddlePaddle production image:
+
+```
+FROM paddlepaddle/paddle:<version>
+RUN pip install -U matplotlib jupyter ...
+COPY . /book
+EXPOSE 8080
+CMD ["jupyter"]
+```
+
+The book image is an example of PaddlePaddle application image.  We can build it
+
+```bash
+git clone https://github.com/paddlepaddle/book
+cd book
+docker build -t book .
+```
+
+### Build and Run Distributed Applications
+
+In our [API design doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/api.md#distributed-training), we proposed an API that starts a distributed training job on a cluster.  This API needs to build a PaddlePaddle application into a Docker image as above and call kubectl to run it on the cluster.  This API might need to generate a Dockerfile that looks like the one above and call `docker build`.
+
+Of course, we can manually build an application image and launch the job using the kubectl tool:
+
+```bash
+docker build -f some/Dockerfile -t myapp .
+docker tag myapp me/myapp
+docker push
+kubectl ...
+```
diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh
index b2e6416c3d..c44874eede 100755
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -7,7 +7,32 @@ function abort(){
 
 trap 'abort' 0
 set -e
-
+mkdir -p /paddle/dist/cpu
+mkdir -p /paddle/dist/gpu
+mkdir -p /paddle/dist/cpu-noavx
+mkdir -p /paddle/dist/gpu-noavx
+# Set BASE_IMAGE and DEB_PATH according to env variables
+if [ ${WITH_GPU} == "ON" ]; then
+  BASE_IMAGE="nvidia/cuda:7.5-cudnn5-runtime-ubuntu14.04"
+  # additional packages to install when building gpu images
+  GPU_DOCKER_PKG="python-pip"
+  if [ ${WITH_AVX} == "ON" ]; then
+    DEB_PATH="dist/gpu/"
+    DOCKER_SUFFIX="gpu"
+  else
+    DEB_PATH="dist/gpu-noavx/"
+    DOCKER_SUFFIX="gpu-noavx"
+  fi
+else
+  BASE_IMAGE="python:2.7.13-slim"
+  if [ ${WITH_AVX} == "ON" ]; then
+    DEB_PATH="dist/cpu/"
+    DOCKER_SUFFIX="cpu"
+  else
+    DEB_PATH="dist/cpu-noavx/"
+    DOCKER_SUFFIX="noavx"
+  fi
+fi
 # If Dockerfile.* sets BUILD_AND_INSTALL to 'ON', it would have copied
 # source tree to /paddle, and this scripts should build it into
 # /paddle/build.
@@ -17,8 +42,11 @@ if [[ ${BUILD_AND_INSTALL:-OFF} == 'ON' ]]; then
     fi
 
     mkdir -p /paddle/build # -p means no error if exists
+    cd /paddle/build
     # clean local cmake and third_party cache
-    cd /paddle/build && rm -rf * && rm -rf ../third_party
+    if [ ${DELETE_BUILD_CACHE} == 'ON' ]; then
+      rm -rf * && rm -rf ../third_party
+    fi
     cmake .. \
 	  -DWITH_DOC=${WITH_DOC:-OFF} \
 	  -DWITH_GPU=${WITH_GPU:-OFF} \
@@ -29,9 +57,15 @@ if [[ ${BUILD_AND_INSTALL:-OFF} == 'ON' ]]; then
 	  -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
     make -j `nproc`
     make install
+    # generate deb package for current build
+    # FIXME(typhoonzero): should we remove paddle/scripts/deb ?
+    # FIXME: CPACK_DEBIAN_PACKAGE_DEPENDS removes all dev dependencies, must
+    # install them in docker
+    cpack -D CPACK_GENERATOR='DEB' -D CPACK_DEBIAN_PACKAGE_DEPENDS="" ..
+    mv /paddle/build/*.deb /paddle/${DEB_PATH}
 
     if [[ ${BUILD_WOBOQ:-OFF} == 'ON' ]]; then
-        apt-get install -y clang-3.8 llvm-3.8 libclang-3.8-dev 
+        apt-get install -y clang-3.8 llvm-3.8 libclang-3.8-dev
         # Install woboq_codebrowser.
         git clone https://github.com/woboq/woboq_codebrowser /woboq
         cd /woboq
@@ -57,6 +91,54 @@ if [[ ${BUILD_AND_INSTALL:-OFF} == 'ON' ]]; then
     pip install /usr/local/opt/paddle/share/wheels/py_paddle*linux*.whl
     pip install /usr/local/opt/paddle/share/wheels/paddle*.whl
     paddle version
+
+    if [[ ${DOCKER_BUILD:-FALSE} == 'TRUE' ]]; then
+	# reduce docker image size
+	rm -rf /paddle/build
+	rm -rf /usr/local/opt/paddle/share/wheels/
+    fi
+fi
+
+# generate production docker image Dockerfile
+if [ ${USE_MIRROR} ]; then
+  MIRROR_UPDATE="sed 's@http:\/\/archive.ubuntu.com\/ubuntu\/@mirror:\/\/mirrors.ubuntu.com\/mirrors.txt@' -i /etc/apt/sources.list && \\"
+else
+  MIRROR_UPDATE="\\"
 fi
 
+cat > /paddle/build/Dockerfile.${DOCKER_SUFFIX} <<EOF
+FROM ${BASE_IMAGE}
+MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
+
+# ENV variables
+ARG WITH_AVX
+ARG WITH_DOC
+ARG WITH_STYLE_CHECK
+
+ENV WITH_GPU=${WITH_GPU}
+ENV WITH_AVX=\${WITH_AVX:-ON}
+ENV WITH_DOC=\${WITH_DOC:-OFF}
+ENV WITH_STYLE_CHECK=\${WITH_STYLE_CHECK:-OFF}
+
+ENV HOME /root
+ENV LANG en_US.UTF-8
+
+# Use Fix locales to en_US.UTF-8
+
+RUN ${MIRROR_UPDATE}
+    apt-get update && \
+    apt-get install -y libgfortran3 ${GPU_DOCKER_PKG} && \
+    apt-get clean -y && \
+    pip install --upgrade pip && \
+    pip install -U 'protobuf==3.1.0' requests
+RUN pip install numpy
+# Use different deb file when building different type of images
+ADD \$PWD/${DEB_PATH}*.deb /usr/local/opt/paddle/deb/
+RUN dpkg --force-all -i /usr/local/opt/paddle/deb/*.deb && rm -f /usr/local/opt/paddle/deb/*.deb
+
+ENV PATH="/usr/local/opt/paddle/bin/:${PATH}"
+# default command shows the paddle version and exit
+CMD ["paddle", "version"]
+EOF
+
 trap : 0
diff --git a/paddle/scripts/docker/doc/paddle-development-environment-gpu.graffle b/paddle/scripts/docker/doc/paddle-development-environment-gpu.graffle
new file mode 100644
index 0000000000..4629f9b9da
Binary files /dev/null and b/paddle/scripts/docker/doc/paddle-development-environment-gpu.graffle differ
diff --git a/paddle/scripts/docker/doc/paddle-development-environment-gpu.png b/paddle/scripts/docker/doc/paddle-development-environment-gpu.png
new file mode 100644
index 0000000000..61a96d7198
Binary files /dev/null and b/paddle/scripts/docker/doc/paddle-development-environment-gpu.png differ
diff --git a/paddle/scripts/docker/doc/paddle-development-environment.graffle b/paddle/scripts/docker/doc/paddle-development-environment.graffle
new file mode 100644
index 0000000000..5b164c4832
Binary files /dev/null and b/paddle/scripts/docker/doc/paddle-development-environment.graffle differ
diff --git a/paddle/scripts/docker/doc/paddle-development-environment.png b/paddle/scripts/docker/doc/paddle-development-environment.png
new file mode 100644
index 0000000000..707ed45a33
Binary files /dev/null and b/paddle/scripts/docker/doc/paddle-development-environment.png differ
diff --git a/paddle/trainer/Trainer.h b/paddle/trainer/Trainer.h
index c8ee4726c2..fac589d1d7 100644
--- a/paddle/trainer/Trainer.h
+++ b/paddle/trainer/Trainer.h
@@ -30,10 +30,6 @@ limitations under the License. */
 #include "TrainerConfigHelper.h"
 #include "TrainerInternal.h"
 
-#ifdef PADDLE_METRIC_LEARNING
-#include "paddle/internals/metric_learning/MetricTrainer.h"
-#endif
-
 DECLARE_int32(num_passes);
 
 namespace paddle {
@@ -201,12 +197,8 @@ protected:
   // parameter util
   std::unique_ptr<ParameterUtil> paramUtil_;
 
-#ifdef PADDLE_METRIC_LEARNING
-  MetricTrainer trainerInternal_;
-#else
   // trainer Internal
   TrainerInternal trainerInternal_;
-#endif
 };
 
 }  // namespace paddle
diff --git a/paddle/utils/Flags.cpp b/paddle/utils/Flags.cpp
index e8f31bc811..320f671ed9 100644
--- a/paddle/utils/Flags.cpp
+++ b/paddle/utils/Flags.cpp
@@ -30,7 +30,6 @@ DEFINE_bool(parallel_nn,
 DEFINE_int32(trainer_count, 1, "Defined how many trainers to train");
 DEFINE_int32(gpu_id, 0, "Which gpu core to use");
 DEFINE_int32(port, 20134, "Listening port for pserver");
-DEFINE_int32(data_server_port, 21134, "Listening port for dserver");
 DEFINE_int32(ports_num,
              1,
              "Number of ports for sending dense parameter,"
diff --git a/paddle/utils/Flags.h b/paddle/utils/Flags.h
index 3e72f8356d..dc4faef833 100644
--- a/paddle/utils/Flags.h
+++ b/paddle/utils/Flags.h
@@ -19,7 +19,6 @@ limitations under the License. */
 DECLARE_bool(parallel_nn);
 DECLARE_int32(async_count);
 DECLARE_int32(port);
-DECLARE_int32(data_server_port);
 DECLARE_bool(use_gpu);
 DECLARE_int32(gpu_id);
 DECLARE_int32(trainer_count);
diff --git a/paddle/utils/GlobalConstants.h b/paddle/utils/GlobalConstants.h
index 707346f2c7..0ec1c28dfb 100644
--- a/paddle/utils/GlobalConstants.h
+++ b/paddle/utils/GlobalConstants.h
@@ -23,11 +23,6 @@ enum PassType {
   PASS_TEST,    // Test pass
   PASS_GC,      // Gradient Check pass
   PASS_METRIC,  // pass for generate template output with no drop rate.
-  // pass for metric learning training with metric learning error, only used
-  // when we are doing KNN evaluation.
-  PASS_METRIC_TRAIN,
-  PASS_METRIC_TRAIN_WITH_NOERROR,  // Pass for metric learning training
-                                   // with no evaluation.
 };
 
 enum ParameterType {
diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
index da937152ee..e257aa568f 100644
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -2301,14 +2301,9 @@ def Generator(
 
 @config_layer('expand')
 class ExpandLayer(LayerBase):
-    def __init__(self,
-                 name,
-                 inputs,
-                 trans_type='non-seq',
-                 device=None,
-                 bias=False):
+    def __init__(self, name, inputs, trans_type='non-seq', bias=False, **xargs):
         super(ExpandLayer, self).__init__(
-            name, 'expand', 0, inputs=inputs, device=device)
+            name, 'expand', 0, inputs=inputs, **xargs)
         config_assert(
             len(self.inputs) == 2, 'ExpandLayer takes 2 and only 2 inputs')
         self.config.trans_type = trans_type
@@ -2339,11 +2334,10 @@ class MaxLayer(LayerBase):
                  inputs,
                  trans_type='non-seq',
                  active_type='linear',
-                 device=None,
                  bias=False,
-                 output_max_index=None):
-        super(MaxLayer, self).__init__(
-            name, 'max', 0, inputs=inputs, device=device)
+                 output_max_index=None,
+                 **xargs):
+        super(MaxLayer, self).__init__(name, 'max', 0, inputs=inputs, **xargs)
         config_assert(len(self.inputs) == 1, 'MaxLayer must have 1 input')
         self.config.trans_type = trans_type
         self.config.active_type = active_type
@@ -2390,15 +2384,15 @@ class SequenceLastInstanceLayer(LayerBase):
                  inputs,
                  active_type='linear',
                  trans_type='non-seq',
-                 device=None,
-                 bias=False):
+                 bias=False,
+                 **xargs):
         super(SequenceLastInstanceLayer, self).__init__(
             name,
             'seqlastins',
             0,
             inputs=inputs,
-            device=device,
-            active_type=active_type)
+            active_type=active_type,
+            **xargs)
         config_assert(
             len(inputs) == 1, 'SequenceLastInstanceLayer must have 1 input')
         self.config.trans_type = trans_type
@@ -2410,39 +2404,29 @@ class SequenceLastInstanceLayer(LayerBase):
 
 @config_layer('seqfirstins')
 class SequenceFirstInstanceLayer(SequenceLastInstanceLayer):
-    def __init__(
-            self,
-            name,
-            inputs,
-            active_type='linear',
-            trans_type='non-seq',
-            device=None,
-            bias=False, ):
+    def __init__(self,
+                 name,
+                 inputs,
+                 active_type='linear',
+                 trans_type='non-seq',
+                 bias=False,
+                 **xargs):
         super(SequenceFirstInstanceLayer, self).__init__(
-            name,
-            inputs=inputs,
-            active_type=active_type,
-            device=device,
-            bias=bias)
+            name, inputs=inputs, active_type=active_type, bias=bias, **xargs)
         self.config.trans_type = trans_type
         self.config.select_first = True
 
 
 @config_layer('seqconcat')
 class SequenceConcatLayer(LayerBase):
-    def __init__(self,
-                 name,
-                 inputs,
-                 active_type='linear',
-                 device=None,
-                 bias=False):
+    def __init__(self, name, inputs, active_type='linear', bias=False, **xargs):
         super(SequenceConcatLayer, self).__init__(
             name,
             'seqconcat',
             0,
             inputs=inputs,
-            device=device,
-            active_type=active_type)
+            active_type=active_type,
+            **xargs)
         config_assert(
             len(inputs) == 2, 'SequenceConcatLayer must have 2 inputs')
         for input_index in xrange(len(self.inputs)):
@@ -2458,15 +2442,15 @@ class SequenceReshapeLayer(LayerBase):
                  size,
                  inputs,
                  active_type='linear',
-                 device=None,
-                 bias=False):
+                 bias=False,
+                 **xargs):
         super(SequenceReshapeLayer, self).__init__(
             name,
             'seqreshape',
             size,
             inputs=inputs,
-            device=device,
-            active_type=active_type)
+            active_type=active_type,
+            **xargs)
         config_assert(
             len(inputs) == 1, 'SequenceReshapeLayer must have 1 inputs')
         self.set_layer_size(size)
@@ -2475,19 +2459,9 @@ class SequenceReshapeLayer(LayerBase):
 
 @config_layer('subseq')
 class SubSequenceLayer(LayerBase):
-    def __init__(self,
-                 name,
-                 inputs,
-                 active_type='linear',
-                 device=None,
-                 bias=False):
+    def __init__(self, name, inputs, active_type='linear', bias=False, **xargs):
         super(SubSequenceLayer, self).__init__(
-            name,
-            'subseq',
-            0,
-            inputs=inputs,
-            device=device,
-            active_type=active_type)
+            name, 'subseq', 0, inputs=inputs, active_type=active_type, **xargs)
         config_assert(len(inputs) == 3, 'SubSequenceLayer must have 3 inputs')
         input_layer0 = self.get_input_layer(0)
         size = input_layer0.size
@@ -2644,15 +2618,10 @@ class AverageLayer(LayerBase):
                  average_strategy='average',
                  trans_type='non-seq',
                  active_type='linear',
-                 device=None,
-                 bias=False):
+                 bias=False,
+                 **xargs):
         super(AverageLayer, self).__init__(
-            name,
-            'average',
-            0,
-            inputs=inputs,
-            device=device,
-            active_type=active_type)
+            name, 'average', 0, inputs=inputs, active_type=active_type, **xargs)
         self.config.average_strategy = average_strategy
         self.config.trans_type = trans_type
         config_assert(len(inputs) == 1, 'AverageLayer must have 1 input')
@@ -2676,9 +2645,9 @@ class CosSimLayer(LayerBase):
 
 @config_layer('tensor')
 class TensorLayer(LayerBase):
-    def __init__(self, name, size, inputs, device=None, bias=True, **xargs):
+    def __init__(self, name, size, inputs, bias=True, **xargs):
         super(TensorLayer, self).__init__(
-            name, 'tensor', size, inputs=inputs, device=device, **xargs)
+            name, 'tensor', size, inputs=inputs, **xargs)
         config_assert(len(self.inputs) == 2, 'TensorLayer must have 2 inputs')
         config_assert(size > 0, 'size must be positive')
         config_assert(inputs[1].parameter_name == None,
@@ -3029,7 +2998,7 @@ class CRFLayer(LayerBase):
         super(CRFLayer, self).__init__(name, 'crf', size, inputs, device=device)
         config_assert(2 <= len(self.inputs) <= 3,
                       'CRFLayer must have 2 or 3 inputs')
-        self.create_input_parameter(0, size * (size + 2), [size, size + 2])
+        self.create_input_parameter(0, size * (size + 2), [size + 2, size])
         self.config.coeff = coeff
 
 
@@ -3051,7 +3020,7 @@ class CRFDecodingLayer(LayerBase):
         config_assert(
             len(self.inputs) <= 2,
             'CRFDecodingLayer cannot have more than 2 inputs')
-        self.create_input_parameter(0, size * (size + 2), [size, size + 2])
+        self.create_input_parameter(0, size * (size + 2), [size + 2, size])
 
 
 @config_layer('ctc')
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index b94f8f9a78..7cd3ce9131 100755
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -52,6 +52,7 @@ __all__ = [
     "cos_sim",
     "hsigmoid",
     "conv_projection",
+    "mse_cost",
     "regression_cost",
     'classification_cost',
     "LayerOutput",
@@ -3572,11 +3573,14 @@ def __cost_input__(input, label, weight=None):
 
 @wrap_name_default()
 @layer_support()
-def regression_cost(input, label, weight=None, name=None, layer_attr=None):
+def mse_cost(input, label, weight=None, name=None, layer_attr=None):
     """
-    Regression Layer.
+    mean squared error cost:
+
+    ..  math::
+
+       \frac{1}{N}\sum_{i=1}^{N}(t_i - y_i)^2
 
-    TODO(yuyang18): Complete this method.
 
     :param name: layer name.
     :type name: basestring
@@ -3602,6 +3606,9 @@ def regression_cost(input, label, weight=None, name=None, layer_attr=None):
     return LayerOutput(name, LayerType.COST, parents=parents, size=1)
 
 
+regression_cost = mse_cost
+
+
 @wrap_name_default("cost")
 @layer_support()
 def classification_cost(input,
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers.protostr
index 10e59e21bc..05fd1c99d2 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers.protostr
@@ -239,9 +239,9 @@ parameters {
   name: "___crf_layer_0__.w0"
   size: 24
   initial_mean: 0.0
-  initial_std: 0.5
-  dims: 4
+  initial_std: 0.408248290464
   dims: 6
+  dims: 4
   initial_strategy: 0
   initial_smart: true
 }
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers_with_weight.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers_with_weight.protostr
index 811b38ae4a..3244181a63 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers_with_weight.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers_with_weight.protostr
@@ -45,7 +45,7 @@ layers {
   coeff: 1.0
 }
 layers {
-  name: "__regression_cost_0__"
+  name: "__mse_cost_0__"
   type: "square_error"
   size: 1
   active_type: ""
@@ -84,7 +84,7 @@ input_layer_names: "input"
 input_layer_names: "label"
 input_layer_names: "weight"
 output_layer_names: "__cost_0__"
-output_layer_names: "__regression_cost_0__"
+output_layer_names: "__mse_cost_0__"
 evaluators {
   name: "classification_error_evaluator"
   type: "classification_error"
@@ -99,12 +99,12 @@ sub_models {
   layer_names: "weight"
   layer_names: "__fc_layer_0__"
   layer_names: "__cost_0__"
-  layer_names: "__regression_cost_0__"
+  layer_names: "__mse_cost_0__"
   input_layer_names: "input"
   input_layer_names: "label"
   input_layer_names: "weight"
   output_layer_names: "__cost_0__"
-  output_layer_names: "__regression_cost_0__"
+  output_layer_names: "__mse_cost_0__"
   evaluator_names: "classification_error_evaluator"
   is_recurrent_layer_group: false
 }
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers_with_weight.py b/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers_with_weight.py
index d30f70a55c..1c0aa7f9b9 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers_with_weight.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers_with_weight.py
@@ -10,5 +10,5 @@ fc = fc_layer(input=data, size=10, act=SoftmaxActivation())
 outputs(
     classification_cost(
         input=fc, label=lbl, weight=wt),
-    regression_cost(
+    mse_cost(
         input=fc, label=lbl, weight=wt))
diff --git a/python/paddle/v2/dataset/wmt14.py b/python/paddle/v2/dataset/wmt14.py
index f5a16d5147..c686870a49 100644
--- a/python/paddle/v2/dataset/wmt14.py
+++ b/python/paddle/v2/dataset/wmt14.py
@@ -23,7 +23,7 @@ __all__ = ['train', 'test', 'build_dict']
 URL_DEV_TEST = 'http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/dev+test.tgz'
 MD5_DEV_TEST = '7d7897317ddd8ba0ae5c5fa7248d3ff5'
 # this is a small set of data for test. The original data is too large and will be add later.
-URL_TRAIN = 'http://paddlepaddle.bj.bcebos.com/demo/wmt_shrinked_data/wmt14.tgz'
+URL_TRAIN = 'http://paddlepaddle.cdn.bcebos.com/demo/wmt_shrinked_data/wmt14.tgz'
 MD5_TRAIN = 'a755315dd01c2c35bde29a744ede23a6'
 
 START = "<s>"
diff --git a/python/paddle/v2/tests/test_layer.py b/python/paddle/v2/tests/test_layer.py
index 0055679a91..5ccd3d6913 100644
--- a/python/paddle/v2/tests/test_layer.py
+++ b/python/paddle/v2/tests/test_layer.py
@@ -126,9 +126,8 @@ class CostLayerTest(unittest.TestCase):
         cost3 = layer.cross_entropy_cost(input=inference, label=label)
         cost4 = layer.cross_entropy_with_selfnorm_cost(
             input=inference, label=label)
-        cost5 = layer.regression_cost(input=inference, label=label)
-        cost6 = layer.regression_cost(
-            input=inference, label=label, weight=weight)
+        cost5 = layer.mse_cost(input=inference, label=label)
+        cost6 = layer.mse_cost(input=inference, label=label, weight=weight)
         cost7 = layer.multi_binary_label_cross_entropy_cost(
             input=inference, label=label)
         cost8 = layer.rank_cost(left=score, right=score, label=score)