diff --git a/CMakeLists.txt b/CMakeLists.txt index d82d8f633c..65fbbb481c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -25,8 +25,8 @@ find_package(ZLIB REQUIRED) find_package(NumPy REQUIRED) find_package(Threads REQUIRED) find_package(AVX QUIET) -find_package(Glog) -find_package(Gflags QUIET) +find_package(Glog REQUIRED) +find_package(Gflags REQUIRED) find_package(GTest) find_package(Sphinx) find_package(Doxygen) @@ -40,8 +40,6 @@ option(WITH_AVX "Compile PaddlePaddle with avx intrinsics" ${AVX_FOUND}) option(WITH_PYTHON "Compile PaddlePaddle with python interpreter" ON) option(WITH_STYLE_CHECK "Style Check for PaddlePaddle" ${PYTHONINTERP_FOUND}) option(WITH_RDMA "Compile PaddlePaddle with rdma support" OFF) -option(WITH_GLOG "Compile PaddlePaddle use glog, otherwise use a log implement internally" ${LIBGLOG_FOUND}) -option(WITH_GFLAGS "Compile PaddlePaddle use gflags, otherwise use a flag implement internally" ${GFLAGS_FOUND}) option(WITH_TIMER "Compile PaddlePaddle use timer" OFF) option(WITH_PROFILER "Compile PaddlePaddle use gpu profiler" OFF) option(WITH_TESTING "Compile and run unittest for PaddlePaddle" ${GTEST_FOUND}) @@ -136,16 +134,12 @@ else(WITH_RDMA) add_definitions(-DPADDLE_DISABLE_RDMA) endif(WITH_RDMA) -if(WITH_GLOG) - add_definitions(-DPADDLE_USE_GLOG) - include_directories(${LIBGLOG_INCLUDE_DIR}) -endif() +# glog +include_directories(${LIBGLOG_INCLUDE_DIR}) -if(WITH_GFLAGS) - add_definitions(-DPADDLE_USE_GFLAGS) - add_definitions(-DGFLAGS_NS=${GFLAGS_NAMESPACE}) - include_directories(${GFLAGS_INCLUDE_DIRS}) -endif() +# gflags +add_definitions(-DGFLAGS_NS=${GFLAGS_NAMESPACE}) +include_directories(${GFLAGS_INCLUDE_DIRS}) if(WITH_TESTING) enable_testing() diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000000..0d4bb973ae --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1 @@ +./doc/howto/dev/contribute_to_paddle_en.md diff --git a/WORKSPACE b/WORKSPACE index 0b8299905a..f097c41da8 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -3,7 +3,7 @@ http_archive( name="protobuf", url="http://github.com/google/protobuf/archive/v3.1.0.tar.gz", sha256="0a0ae63cbffc274efb573bdde9a253e3f32e458c41261df51c5dbc5ad541e8f7", - strip_prefix="protobuf-3.1.0", ) + strip_prefix="protobuf-3.1.0") # External dependency to gtest 1.7.0. This method comes from # https://www.bazel.io/versions/master/docs/tutorial/cpp.html. @@ -12,4 +12,20 @@ new_http_archive( url="https://github.com/google/googletest/archive/release-1.7.0.zip", sha256="b58cb7547a28b2c718d1e38aee18a3659c9e3ff52440297e965f5edffe34b6d0", build_file="third_party/gtest.BUILD", - strip_prefix="googletest-release-1.7.0", ) + strip_prefix="googletest-release-1.7.0") + +# External dependency to gflags. This method comes from +# https://github.com/gflags/example/blob/master/WORKSPACE. +new_git_repository( + name="gflags", + tag="v2.2.0", + remote="https://github.com/gflags/gflags.git", + build_file="third_party/gflags.BUILD") + +# External dependency to glog. 
This method comes from +# https://github.com/reyoung/bazel_playground/blob/master/WORKSPACE +new_git_repository( + name="glog", + remote="https://github.com/google/glog.git", + commit="b6a5e0524c28178985f0d228e9eaa43808dbec3c", + build_file="third_party/glog.BUILD") diff --git a/cmake/FindSphinx.cmake b/cmake/FindSphinx.cmake index 05aa100eae..d319442ef1 100644 --- a/cmake/FindSphinx.cmake +++ b/cmake/FindSphinx.cmake @@ -72,7 +72,7 @@ function( Sphinx_add_target target_name builder conf cache source destination ) ${source} ${destination} COMMENT "Generating sphinx documentation: ${builder}" - COMMAND ln -s ${destination}/index_*.html ${destination}/index.html + COMMAND ln -sf ${destination}/index_*.html ${destination}/index.html ) set_property( diff --git a/cmake/check_packages.cmake b/cmake/check_packages.cmake index 1a7c6a791b..afb84c6ff5 100644 --- a/cmake/check_packages.cmake +++ b/cmake/check_packages.cmake @@ -14,13 +14,9 @@ if(WITH_STYLE_CHECK) find_package(PythonInterp REQUIRED) endif() -if(WITH_GLOG) - find_package(Glog REQUIRED) -endif() +find_package(Glog REQUIRED) -if(WITH_GFLAGS) - find_package(Gflags REQUIRED) -endif() +find_package(Gflags REQUIRED) if(WITH_TESTING) find_package(GTest REQUIRED) diff --git a/cmake/util.cmake b/cmake/util.cmake index eb7db7ce2e..38366373c6 100644 --- a/cmake/util.cmake +++ b/cmake/util.cmake @@ -65,7 +65,7 @@ endmacro() # link_paddle_exe # add paddle library for a paddle executable, such as trainer, pserver. # -# It will handle WITH_PYTHON/WITH_GLOG etc. +# It will handle WITH_PYTHON etc. function(link_paddle_exe TARGET_NAME) if(WITH_RDMA) generate_rdma_links() @@ -108,6 +108,8 @@ function(link_paddle_exe TARGET_NAME) paddle_cuda ${METRIC_LIBS} ${PROTOBUF_LIBRARY} + ${LIBGLOG_LIBRARY} + ${GFLAGS_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT} ${CBLAS_LIBS} ${ZLIB_LIBRARIES} @@ -125,16 +127,6 @@ function(link_paddle_exe TARGET_NAME) ${PYTHON_LIBRARIES}) endif() - if(WITH_GLOG) - target_link_libraries(${TARGET_NAME} - ${LIBGLOG_LIBRARY}) - endif() - - if(WITH_GFLAGS) - target_link_libraries(${TARGET_NAME} - ${GFLAGS_LIBRARIES}) - endif() - if(WITH_GPU) if(NOT WITH_DSO OR WITH_METRIC) target_link_libraries(${TARGET_NAME} diff --git a/demo/semantic_role_labeling/data/extract_dict_feature.py b/demo/semantic_role_labeling/data/extract_dict_feature.py index a02a49a86e..da44111976 100644 --- a/demo/semantic_role_labeling/data/extract_dict_feature.py +++ b/demo/semantic_role_labeling/data/extract_dict_feature.py @@ -43,13 +43,13 @@ def extract_dict_features(pair_file, feature_file): mark[verb_index] = 1 ctx_0 = sentence_list[verb_index] - if verb_index < len(labels_list) - 2: + if verb_index < len(labels_list) - 1: mark[verb_index + 1] = 1 ctx_p1 = sentence_list[verb_index + 1] else: ctx_p1 = 'eos' - if verb_index < len(labels_list) - 3: + if verb_index < len(labels_list) - 2: mark[verb_index + 2] = 1 ctx_p2 = sentence_list[verb_index + 2] else: diff --git a/doc/CMakeLists.txt b/doc/CMakeLists.txt index 1b0fbadeb3..6fa42fd0c7 100644 --- a/doc/CMakeLists.txt +++ b/doc/CMakeLists.txt @@ -16,7 +16,7 @@ set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees") set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html") configure_file( - "${CMAKE_CURRENT_SOURCE_DIR}/conf.py.en.in" + "${CMAKE_CURRENT_SOURCE_DIR}/templates/conf.py.en.in" "${BINARY_BUILD_DIR_EN}/conf.py" @ONLY) @@ -41,7 +41,7 @@ set(SPHINX_CACHE_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_doctrees") set(SPHINX_HTML_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/html") configure_file( - 
"${CMAKE_CURRENT_SOURCE_DIR}/conf.py.cn.in" + "${CMAKE_CURRENT_SOURCE_DIR}/templates/conf.py.cn.in" "${BINARY_BUILD_DIR_CN}/conf.py" @ONLY) diff --git a/doc/about/index_cn.md b/doc/about/index_cn.md new file mode 100644 index 0000000000..3bf030004d --- /dev/null +++ b/doc/about/index_cn.md @@ -0,0 +1,11 @@ +关于PaddlePaddle +================ + +PaddlePaddle是一个最早由百度科学家和工程师共同研发的并行分布式深度学习平台,兼备易用性、高效性、灵活性和可扩展性,目前已被百度内部多个产品线广泛使用。 +PaddlePaddle目前已经开放源码, 但是远未完善,我们希望能在这个基础上不断的改进、扩展和延伸。 +同时我们希望广大开发者积极提供反馈和贡献源代码,建立一个活跃的开源社区。 + +致谢 +-------- + +在此,特别感谢PaddlePaddle的[所有贡献者](https://github.com/PaddlePaddle/Paddle/graphs/contributors)。 diff --git a/doc/about/index_en.rst b/doc/about/index_en.rst index 8a372d2bc2..065c430cde 100644 --- a/doc/about/index_en.rst +++ b/doc/about/index_en.rst @@ -11,4 +11,4 @@ We hope to build an active open source community both by providing feedback and Credits -------- -We owe many thanks to `all contributors and developers `_ of PaddlePaddle! +We owe many thanks to `all contributors and developers `_ of PaddlePaddle! diff --git a/doc/api/index_cn.rst b/doc/api/index_cn.rst index 2d54af84b8..3718cd73a2 100644 --- a/doc/api/index_cn.rst +++ b/doc/api/index_cn.rst @@ -1,5 +1,5 @@ -API -=== +API中文手册 +============ DataProvider API ---------------- diff --git a/doc/getstarted/basic_usage/index_cn.rst b/doc/getstarted/basic_usage/index_cn.rst index 8b84306ed7..d01cdaaeb7 100644 --- a/doc/getstarted/basic_usage/index_cn.rst +++ b/doc/getstarted/basic_usage/index_cn.rst @@ -1,16 +1,16 @@ -简介 -==== +经典的线性回归任务 +================== PaddlePaddle是源于百度的一个深度学习平台。这份简短的介绍将向你展示如何利用PaddlePaddle来解决一个经典的线性回归问题。 -1. 一个经典的任务 ------------------ +任务简介 +-------- 我们展示如何用PaddlePaddle解决 `单变量的线性回归 `_ 问题。线性回归的输入是一批点 `(x, y)` ,其中 `y = wx + b + ε`, 而 ε 是一个符合高斯分布的随机变量。线性回归的输出是从这批点估计出来的参数 `w` 和 `b` 。 一个例子是房产估值。我们假设房产的价格(y)是其大小(x)的一个线性函数,那么我们可以通过收集市场上房子的大小和价格,用来估计线性函数的参数w 和 b。 -2. 准备数据 +准备数据 ----------- 假设变量 `x` 和 `y` 的真实关系为: `y = 2x + 0.3 + ε`,这里展示如何使用观测数据来拟合这一线性关系。首先,Python代码将随机产生2000个观测点,作为线性回归的输入。下面脚本符合PaddlePaddle期待的读取数据的Python程序的模式。 @@ -28,7 +28,7 @@ PaddlePaddle是源于百度的一个深度学习平台。这份简短的介绍 x = random.random() yield [x], [2*x+0.3] -3. 训练模型 +训练模型 ----------- 为了还原 `y = 2x + 0.3`,我们先从一条随机的直线 `y' = wx + b` 开始,然后利用观测数据调整 `w` 和 `b` 使得 `y'` 和 `y` 的差距不断减小,最终趋于接近。这个过程就是模型的训练过程,而 `w` 和 `b` 就是模型的参数,即我们的训练目标。 @@ -79,7 +79,7 @@ PaddlePaddle是源于百度的一个深度学习平台。这份简短的介绍 PaddlePaddle将在观测数据集上迭代训练30轮,并将每轮的模型结果存放在 `./output` 路径下。从输出日志可以看到,随着轮数增加误差代价函数的输出在不断的减小,这意味着模型在训练数据上不断的改进,直到逼近真实解:` y = 2x + 0.3 ` -4. 模型检验 +模型检验 ----------- 训练完成后,我们希望能够检验模型的好坏。一种常用的做法是用学习的模型对另外一组测试数据进行预测,评价预测的效果。在这个例子中,由于已经知道了真实答案,我们可以直接观察模型的参数是否符合预期来进行检验。 @@ -106,10 +106,3 @@ PaddlePaddle将每个模型参数作为一个numpy数组单独存为一个文件 从图中可以看到,虽然 `w` 和 `b` 都使用随机值初始化,但在起初的几轮训练中它们都在快速逼近真实值,并且后续仍在不断改进,使得最终得到的模型几乎与真实模型一致。 这样,我们用PaddlePaddle解决了单变量线性回归问题, 包括数据输入、模型训练和最后的结果验证。 - -5. 推荐后续阅读 ---------------- - -- `安装/编译 <../build_and_install/index.html>`_ :PaddlePaddle的安装与编译文档。 -- `快速入门 <../demo/quick_start/index.html>`_ :使用商品评论分类任务,系统性的介绍如何一步步改进,最终得到产品级的深度模型。 -- `示例 <../demo/index.html>`_ :各种实用案例,涵盖图像、文本、推荐等多个领域。 \ No newline at end of file diff --git a/doc/getstarted/basic_usage/index_en.rst b/doc/getstarted/basic_usage/index_en.rst index 4ffadc68ee..c10b897d42 100644 --- a/doc/getstarted/basic_usage/index_en.rst +++ b/doc/getstarted/basic_usage/index_en.rst @@ -1,15 +1,15 @@ -Basic Usage -============= +Simple Linear Regression +======================== PaddlePaddle is a deep learning platform open-sourced by Baidu. 
With PaddlePaddle, you can easily train a classic neural network within a couple lines of configuration, or you can build sophisticated models that provide state-of-the-art performance on difficult learning tasks like sentiment analysis, machine translation, image caption and so on. -1. A Classic Problem --------------------- +Problem Background +------------------ Now, to give you a hint of what using PaddlePaddle looks like, let's start with a fundamental learning problem - `simple linear regression `_: you have observed a set of two-dimensional data points of ``X`` and ``Y``, where ``X`` is an explanatory variable and ``Y`` is the corresponding dependent variable, and you want to recover the underlying correlation between ``X`` and ``Y``. Linear regression can be used in many practical scenarios. For example, ``X`` can be a variable about house size, and ``Y`` a variable about house price. You can build a model that captures the relationship between them by observing real estate markets. -2. Prepare the Data -------------------- +Prepare the Data +----------------- Suppose the true relationship can be characterized as ``Y = 2X + 0.3``, let's see how to recover this pattern only from observed data. Here is a piece of python code that feeds synthetic data to PaddlePaddle. The code is pretty self-explanatory, the only extra thing you need to add for PaddlePaddle is a definition of input data types. @@ -26,8 +26,8 @@ Suppose the true relationship can be characterized as ``Y = 2X + 0.3``, let's se x = random.random() yield [x], [2*x+0.3] -3. Train a NeuralNetwork ------------------------- +Train a Neural Network +---------------------- To recover this relationship between ``X`` and ``Y``, we use a neural network with one layer of linear activation units and a square error cost layer. Don't worry if you are not familiar with these terminologies, it's just saying that we are starting from a random line ``Y' = wX + b`` , then we gradually adapt ``w`` and ``b`` to minimize the difference between ``Y'`` and ``Y``. Here is what it looks like in PaddlePaddle: @@ -73,8 +73,8 @@ Now that everything is ready, you can train the network with a simple command li This means that PaddlePaddle will train this network on the synthetic dataset for 30 passes, and save all the models under path ``./output``. You will see from the messages printed out during training phase that the model cost is decreasing as time goes by, which indicates we are getting a closer guess. -4. Evaluate the Model ----------------------- +Evaluate the Model +------------------- Usually, a different dataset that was left out during the training phase should be used to evaluate the models. However, we are lucky enough to know the real answer: ``w=2, b=0.3``, thus a better option is to check out model parameters directly. diff --git a/doc/getstarted/build_and_install/build_from_source_en.md b/doc/getstarted/build_and_install/build_from_source_en.md index 5db871d59a..aaa07d49d3 100644 --- a/doc/getstarted/build_and_install/build_from_source_en.md +++ b/doc/getstarted/build_and_install/build_from_source_en.md @@ -49,10 +49,8 @@ PaddlePaddle supports some build options. To enable it, first you need to instal WITH_GPUCompile with GPU mode. WITH_DOUBLECompile with double precision floating-point, default: single precision. -WITH_GLOGCompile with glog. If not found, default: an internal log implementation. -WITH_GFLAGSCompile with gflags. If not found, default: an internal flag implementation. WITH_TESTINGCompile with gtest for PaddlePaddle's unit testing. 
-WITH_DOC Compile to generate PaddlePaddle's docs, default: disabled (OFF). +WITH_DOC Compile to generate PaddlePaddle's docs, default: disabled (OFF). WITH_SWIG_PYCompile with python predict API, default: disabled (OFF). WITH_STYLE_CHECKCompile with code style check, default: enabled (ON). diff --git a/doc/getstarted/build_and_install/cmake/compile_options.csv b/doc/getstarted/build_and_install/cmake/compile_options.csv index 171d8fba71..463b825470 100644 --- a/doc/getstarted/build_and_install/cmake/compile_options.csv +++ b/doc/getstarted/build_and_install/cmake/compile_options.csv @@ -6,8 +6,6 @@ WITH_AVX,是否编译含有AVX指令集的PaddlePaddle二进制文件,是 WITH_PYTHON,是否内嵌PYTHON解释器。方便今后的嵌入式移植工作。,是 WITH_STYLE_CHECK,是否编译时进行代码风格检查,是 WITH_RDMA,是否开启RDMA,否 -WITH_GLOG,是否开启GLOG。如果不开启,则会使用一个简化版的日志,同时方便今后的嵌入式移植工作。,取决于是否寻找到GLOG -WITH_GFLAGS,是否使用GFLAGS。如果不开启,则会使用一个简化版的命令行参数解析器,同时方便今后的嵌入式移植工作。,取决于是否寻找到GFLAGS WITH_TIMER,是否开启计时功能。如果开启会导致运行略慢,打印的日志变多,但是方便调试和测Benchmark,否 WITH_TESTING,是否开启单元测试,取决于是否寻找到GTEST WITH_DOC,是否编译中英文文档,否 diff --git a/doc/getstarted/build_and_install/index_cn.rst b/doc/getstarted/build_and_install/index_cn.rst index e599aab2cb..3ffa858504 100644 --- a/doc/getstarted/build_and_install/index_cn.rst +++ b/doc/getstarted/build_and_install/index_cn.rst @@ -1,5 +1,5 @@ 编译与安装 -======================== +========== 安装 ++++ @@ -24,4 +24,4 @@ PaddlePaddle提供数个预编译的二进制来进行安装,包括Docker镜 .. toctree:: :maxdepth: 1 - cmake/build_from_source_cn.rst \ No newline at end of file + cmake/build_from_source_cn.rst diff --git a/doc/getstarted/build_and_install/ubuntu_install_cn.rst b/doc/getstarted/build_and_install/ubuntu_install_cn.rst index f923a1917c..d02d9c63bb 100644 --- a/doc/getstarted/build_and_install/ubuntu_install_cn.rst +++ b/doc/getstarted/build_and_install/ubuntu_install_cn.rst @@ -46,8 +46,6 @@ PaddlePaddle提供了ubuntu 14.04 deb安装包。 with_double: OFF with_python: ON with_rdma: OFF - with_glog: ON - with_gflags: ON with_metric_learning: with_timer: OFF with_predict_sdk: diff --git a/doc/getstarted/index_cn.rst b/doc/getstarted/index_cn.rst index a0867a6e59..c6a4d3121c 100644 --- a/doc/getstarted/index_cn.rst +++ b/doc/getstarted/index_cn.rst @@ -1,4 +1,4 @@ -GET STARTED +新手入门 ============ .. toctree:: diff --git a/doc/howto/concepts/nn_cn.rst b/doc/howto/concepts/nn_cn.rst deleted file mode 100644 index f4d2cf490d..0000000000 --- a/doc/howto/concepts/nn_cn.rst +++ /dev/null @@ -1,3 +0,0 @@ -TBD - -目前正在书写中。敬请期待。 \ No newline at end of file diff --git a/doc/howto/concepts/program_concepts_cn.rst b/doc/howto/concepts/program_concepts_cn.rst deleted file mode 100644 index af5bbdac26..0000000000 --- a/doc/howto/concepts/program_concepts_cn.rst +++ /dev/null @@ -1,4 +0,0 @@ -TBD -### - -目前正在书写中。敬请期待。 \ No newline at end of file diff --git a/doc/howto/deep_model/index_cn.rst b/doc/howto/deep_model/index_cn.rst deleted file mode 100644 index 31f8c39af6..0000000000 --- a/doc/howto/deep_model/index_cn.rst +++ /dev/null @@ -1,10 +0,0 @@ -How to Configure Deep Models -============================ - -.. toctree:: - :maxdepth: 1 - - rnn/recurrent_group_cn.md - rnn/hierarchical_layer_cn.rst - rnn/hrnn_rnn_api_compare_cn.rst - rnn/hrnn_demo_cn.rst diff --git a/doc/howto/deep_model/index_en.rst b/doc/howto/deep_model/index_en.rst deleted file mode 100644 index 00a45641e6..0000000000 --- a/doc/howto/deep_model/index_en.rst +++ /dev/null @@ -1,7 +0,0 @@ -How to Configure Deep Models -============================ - -.. 
toctree:: - :maxdepth: 1 - - rnn/rnn_en.rst diff --git a/doc/howto/deep_model/rnn/hrnn_demo_cn.rst b/doc/howto/deep_model/rnn/hrnn_demo_cn.rst deleted file mode 100644 index 96396ff105..0000000000 --- a/doc/howto/deep_model/rnn/hrnn_demo_cn.rst +++ /dev/null @@ -1,7 +0,0 @@ -.. _algo_hrnn_demo: - -################# -双层RNN的使用示例 -################# - -TBD \ No newline at end of file diff --git a/doc/howto/deep_model/rnn/index_cn.rst b/doc/howto/deep_model/rnn/index_cn.rst new file mode 100644 index 0000000000..9e805ca851 --- /dev/null +++ b/doc/howto/deep_model/rnn/index_cn.rst @@ -0,0 +1,9 @@ +RNN相关模型 +=========== + +.. toctree:: + :maxdepth: 1 + + recurrent_group_cn.md + hierarchical_layer_cn.rst + hrnn_rnn_api_compare_cn.rst diff --git a/doc/howto/deep_model/rnn/index_en.rst b/doc/howto/deep_model/rnn/index_en.rst new file mode 100644 index 0000000000..7adc79873d --- /dev/null +++ b/doc/howto/deep_model/rnn/index_en.rst @@ -0,0 +1,7 @@ +RNN Models +========== + +.. toctree:: + :maxdepth: 1 + + rnn_config_en.rst diff --git a/doc/howto/deep_model/rnn/rnn_en.rst b/doc/howto/deep_model/rnn/rnn_config_en.rst similarity index 100% rename from doc/howto/deep_model/rnn/rnn_en.rst rename to doc/howto/deep_model/rnn/rnn_config_en.rst diff --git a/doc/howto/new_layer/FullyConnected.jpg b/doc/howto/dev/FullyConnected.jpg similarity index 100% rename from doc/howto/new_layer/FullyConnected.jpg rename to doc/howto/dev/FullyConnected.jpg diff --git a/doc/howto/dev/contribute_to_paddle_cn.md b/doc/howto/dev/contribute_to_paddle_cn.md new file mode 100644 index 0000000000..e0a63f5a14 --- /dev/null +++ b/doc/howto/dev/contribute_to_paddle_cn.md @@ -0,0 +1,131 @@ +# 如何贡献代码 + +我们真诚地感谢您的贡献,欢迎通过 GitHub 的 fork 和 pull request 流程来提交代码。 + +## 代码要求 +- 你的代码必须按照 [doxygen](http://www.stack.nl/~dimitri/doxygen/) 样式编写完整的注释文档。 +- 确保编译选项 WITH\_STYLE\_CHECK 已打开,并且编译能通过代码样式检查。 +- 所有代码必须具有单元测试。 +- 通过所有单元测试。 + +以下教程将指导您提交代码。 + +## [Fork](https://help.github.com/articles/fork-a-repo/) + +跳转到[PaddlePaddle](https://github.com/PaddlePaddle/Paddle) GitHub首页,然后单击 `Fork` 按钮。 + +## 克隆(Clone) + +Paddle 目前使用[git流分支模型](http://nvie.com/posts/a-successful-git-branching-model/)进行开发、测试、发行和维护。 +**develop** 是主分支,其他用户分支是特性分支(feature branches)。 + +一旦你创建了一个 fork,你就可以使用你喜欢的 git 客户端克隆你的仓库(repo),或者直接在命令行输入: + +```shell +# 克隆 fork 到本地 +git clone --branch develop https://github.com/USERNAME/Paddle.git +``` +如果你的仓库不包含 **develop** 分支,你只需自己创建它。 + +```shell +git clone https://github.com/USERNAME/Paddle.git Paddle +cd Paddle +git checkout -b develop # 创建 develop 分支 +git remote add upstream https://github.com/PaddlePaddle/Paddle.git # 添加 upstream 到 PaddlePaddle/Paddle +git pull upstream develop # 更新 upstream +git submodule update --init --recursive +``` + +然后你可以创建一个本地开发分支来开始开发 + +```shell +git checkout -b MY_COOL_STUFF_BRANCH +``` + +## 使用 `pre-commit` 钩子 + +Paddle 开发人员使用 [pre-commit](http://pre-commit.com/) 工具来管理 git 预提交钩子。它可以帮助我们格式化源代码(cpp,python),在提交前检查一些基本事项 +(例如每个文件只能有一个 EOL,git 中不要添加大文件)。`pre-commit` 检查是 Travis-CI 中单元测试的一部分, +不满足钩子要求的 PR 无法合入 Paddle。 + +你可以通过 `pip install pre-commit` 安装 [pre-commit](http://pre-commit.com/), +目前 Paddle 使用 `clang-format` 来调整 C/C++ 源代码格式,请确保 clang-format 版本在 3.8 以上。 + +然后只需在 Paddle clone 目录中运行 `pre-commit install`。当你 +提交代码时,pre-commit 钩子会检查本地代码中 +是否存在不适合提交的内容。 + +## 提交(Commit) + +提交你的代码: + +```shell +# 显示工作树状态 +git status +# 添加修改过的文件 +git add xx +env EDITOR=vim git commit # 你可以用 vim/nano/emacs 写下你的注释 +``` +提交信息的第一行是标题,其他行可以添加一些细节(如果有必要的话)。 + +## 保持 Fork 状态最新 + +在发起 pull request 之前,你应该将本地代码与最新的 PaddlePaddle 同步。 +为此,你需要首先添加远程(remote): 
+```shell +# 观察当前远程仓库配置 +git remote -v +# 添加上游(upstream)仓库 +git remote add upstream https://github.com/PaddlePaddle/Paddle.git +# 验证新的 upstream +git remote -v +``` + +用最新的 upstream 更新你的 fork: + +```shell +git pull --rebase upstream develop +``` +如果本地没有额外的提交,git 将简单地执行快进。但是,如果你一直在做一些改变(绝大多数情况下不应该这样),你可能要处理冲突。 + +现在,你本地的 master 分支已经与上游的最新修改保持一致了。 + +## 推送(Push)到 GitHub + +```shell +# 在 GitHub 上 push 你的仓库 +git push -u origin MY_COOL_STUFF_BRANCH # 在 origin 上创建远程分支 MY_COOL_STUFF_BRANCH +``` + +## 拉取请求(Pull Request) + +转到 GitHub 上你 fork 的仓库页面,选择你的开发分支并单击 **pull request 按钮**。 + +## 使用最新版本更新你的 pull 请求 + +在代码审查(code review)期间,由于 PaddlePaddle/Paddle 中新的提交,你的 pull 请求可能会失效。如果没有冲突,GitHub 允许自动更新,你可以点击 pull request 页面中的“更新分支(Update Branch)”按钮。但是如果存在代码冲突,你就需要手动进行更新,即在本地仓库执行如下命令: + +```shell +git checkout MY_COOL_STUFF_BRANCH +git pull upstream develop +# 你可能需要根据git提示解决冲突 +# 创建并测试你的代码 +git push origin MY_COOL_STUFF_BRANCH +``` +现在你的 Pull Request 是最新的了。 + +## 修改你的 pull request + +当根据审阅者的意见修改 pull 请求时,请使用“git commit”而不是“git commit --amend”来提交更改,以便审阅者可以看到新的请求和旧的请求之间的区别。 + +可能用到的命令如下: + +```shell +git checkout MY_COOL_STUFF_BRANCH +git pull upstream develop # 将本地更新到最新的代码库 +# 可能会发生一些冲突 +# 开始开发吧! +env EDITOR=vim git commit # 添加修改日志 +git push origin MY_COOL_STUFF_BRANCH +``` diff --git a/doc/howto/contribute_to_paddle_en.md b/doc/howto/dev/contribute_to_paddle_en.md similarity index 81% rename from doc/howto/contribute_to_paddle_en.md rename to doc/howto/dev/contribute_to_paddle_en.md index 1decc91d62..e578f6fce8 100644 --- a/doc/howto/contribute_to_paddle_en.md +++ b/doc/howto/dev/contribute_to_paddle_en.md @@ -1,8 +1,8 @@ -# How to Contribute Code +# Contribute Code We sincerely appreciate your contributions. You can use fork and pull request -workflow to merge your code. - +workflow to merge your code. + ## Code Requirements - Your code must be fully documented by [doxygen](http://www.stack.nl/~dimitri/doxygen/) style. @@ -12,11 +12,11 @@ workflow to merge your code. - Pass all unit tests. The following tutorial guides you into submitting your contribution. - + ## [Creating a Fork](https://help.github.com/articles/fork-a-repo/) - + Just head over to the GitHub page and click the "Fork" button. -It's just that simple. +It's just that simple. ## Clone @@ -25,7 +25,7 @@ The **develop** is the main branch, and other user's branches are feature branch Once you've created a fork, you can use your favorite git client to clone your repo or just head straight to the command line: - + ```shell # Clone your fork to your local machine git clone --branch develop https://github.com/USERNAME/Paddle.git @@ -47,6 +47,22 @@ Then you can start to develop by making a local development branch git checkout -b MY_COOL_STUFF_BRANCH ``` +## Using the `pre-commit` Hook + +Paddle developers use the [pre-commit](http://pre-commit.com/) tool to manage git +pre-commit hooks. It can help us format source code (cpp, python) and check some +basic things before commit (only one EOL for each file, no huge files added to git). +The `pre-commit` checks are now part of the unit tests in Travis-CI, and a PR that +does not pass the hooks cannot be merged into Paddle. + +To use [pre-commit](http://pre-commit.com/), install it with +`pip install pre-commit`. Currently, Paddle uses `clang-format` to format +C/C++ sources; please make sure clang-format 3.8+ is installed. + +Then just run `pre-commit install` in your Paddle clone directory. When you +commit your code, the pre-commit hook will check whether the local changes +are suitable to commit. 
+ ## Commit Commit your changes by following command lines: @@ -83,7 +99,7 @@ git pull --rebase upstream develop If there are no unique commits locally, git will simply perform a fast-forward. However, if you have been making changes (in the vast majority of cases you -probably shouldn't be), you may have to deal with conflicts. +probably shouldn't be), you may have to deal with conflicts. Now, your local master branch is up-to-date with everything modified upstream. diff --git a/doc/howto/new_layer/index_en.rst b/doc/howto/dev/new_layer_en.rst similarity index 99% rename from doc/howto/new_layer/index_en.rst rename to doc/howto/dev/new_layer_en.rst index 922bda5b0d..0513f068f3 100644 --- a/doc/howto/new_layer/index_en.rst +++ b/doc/howto/dev/new_layer_en.rst @@ -1,6 +1,6 @@ -======================= -How to Write New Layers -======================= +================ +Write New Layers +================ This tutorial will guide you to write customized layers in PaddlePaddle. We will utilize fully connected layer as an example to guide you through the following steps for writing a new layer. diff --git a/doc/howto/write_docs/index_cn.rst b/doc/howto/dev/write_docs_cn.rst similarity index 90% rename from doc/howto/write_docs/index_cn.rst rename to doc/howto/dev/write_docs_cn.rst index a1f983b340..5051a89230 100644 --- a/doc/howto/write_docs/index_cn.rst +++ b/doc/howto/dev/write_docs_cn.rst @@ -1,6 +1,6 @@ -############################### -如何贡献/修改PaddlePaddle的文档 -############################### +################## +如何贡献/修改文档 +################## PaddlePaddle的文档包括英文文档 ``doc`` 和中文文档 ``doc_cn`` 两个部分。文档都是通过 `cmake`_ 驱动 `sphinx`_ 编译生成,生成后的文档分别存储在编译目录的 ``doc`` 和 ``doc_cn`` 两个子目录下。 @@ -51,4 +51,4 @@ TBD .. _cmake: https://cmake.org/ -.. _sphinx: http://www.sphinx-doc.org/en/1.4.8/ \ No newline at end of file +.. _sphinx: http://www.sphinx-doc.org/en/1.4.8/ diff --git a/doc/howto/index_cn.rst b/doc/howto/index_cn.rst index 4706d9339a..e03138723e 100644 --- a/doc/howto/index_cn.rst +++ b/doc/howto/index_cn.rst @@ -1,27 +1,37 @@ -HOW TO -======= +进阶指南 +======== -Usage -------- +使用说明 +-------- .. toctree:: :maxdepth: 1 - concepts/use_concepts_cn.rst - cluster/k8s/paddle_on_k8s_cn.md - cluster/k8s/distributed_training_on_k8s_cn.md + usage/concepts/use_concepts_cn.rst + usage/cluster/k8s/k8s_cn.md + usage/cluster/k8s/k8s_distributed_cn.md -Development ------------- +开发标准 +-------- .. toctree:: :maxdepth: 1 - write_docs/index_cn.rst - deep_model/index_cn.rst + dev/write_docs_cn.rst + dev/contribute_to_paddle_cn.md -Optimization -------------- +模型配置 +-------- .. toctree:: :maxdepth: 1 + + deep_model/rnn/index_cn.rst + +性能优化 +-------- + +.. toctree:: + :maxdepth: 1 + + optimization/gpu_profiling_cn.rst diff --git a/doc/howto/index_en.rst b/doc/howto/index_en.rst index bd64c5b1fb..983dc743eb 100644 --- a/doc/howto/index_en.rst +++ b/doc/howto/index_en.rst @@ -7,9 +7,8 @@ Usage .. toctree:: :maxdepth: 1 - cmd_parameter/index_en.md - deep_model/index_en.rst - cluster/cluster_train_en.md + usage/cmd_parameter/index_en.md + usage/cluster/cluster_train_en.md Development ------------ @@ -17,8 +16,16 @@ Development .. toctree:: :maxdepth: 1 - new_layer/index_en.rst - contribute_to_paddle_en.md + dev/new_layer_en.rst + dev/contribute_to_paddle_en.md + +Configuration +------------- + +.. toctree:: + :maxdepth: 1 + + deep_model/rnn/index_en.rst Optimization ------------- @@ -26,4 +33,4 @@ Optimization .. 
toctree:: :maxdepth: 1 - optimization/index_en.rst + optimization/gpu_profiling_en.rst diff --git a/doc/howto/optimization/gpu_profiling_cn.rst b/doc/howto/optimization/gpu_profiling_cn.rst new file mode 100644 index 0000000000..e2b0b0396e --- /dev/null +++ b/doc/howto/optimization/gpu_profiling_cn.rst @@ -0,0 +1,232 @@ +================== +GPU性能分析与调优 +================== + +.. contents:: + +此教程将向您分步介绍如何使用内置的定时工具、 **nvprof** 或 **nvvp** 来运行性能分析和调优。 + +- 什么是性能分析? +- 为什么需要性能分析? +- 如何进行性能分析? +- 性能分析工具介绍 +- 详细教程 +- 性能分析小技巧 + +什么是性能分析? +================ +在软件工程的范畴里,性能分析(Profiling)是一个动态程序分析的术语,它可以指测量一个程序的空间(内存)复杂度或时间复杂度, +也可以说是某些特定指令的使用情况,或者是函数调用的频率和耗时等。通常情况下,分析得到的信息用于协助进行程序的优化。 + +简单来说,性能分析工具是用于给应用程序的性能做定量分析的。如果想很好的理解程序的行为,那程序分析工具是必不可少的利器。简单的性能分析,可以告诉您某个操作到底花了多长时间?而更深入的分析,甚至能解释为什么某个操作花了很长时间? + +为什么需要性能分析? +============================ +训练好一个深层神经网络通常要耗费非常长的时间,所以性能也就逐步变成了深度学习领域最重要的指标。 +而优化性能的首要任务,是需要了解哪些步骤拖慢了整体。 +如果某一块根本就不怎么耗时,那也就不需要急着优化性能啦! + +如何进行性能分析? +======================== +为了达到性能最优,您可以采用下面五个步骤: + +- 对代码进行性能分析 +- 找到运行慢的部分 +- 找到运行慢的原因 +- 修改成更快的版本 +- 再次对代码进行性能分析 + +通常情况下,处理器有两个关键性能限制:一个是浮点计算量,另一个是内存操作量。 +GPU则还需要高并行性,才能发挥其全部能力。这正是它们速度快的原因。 + +性能分析工具介绍 +====================== +就通常的GPU性能分析来说,市面上已经有NVIDIA或第三方提供的众多工具。 + +**nvprof** 是Nvidia性能分析工具, **nvvp** 则是带GUI的Nvidia可视化性能分析工具。 +在这个教程中,我们主要会介绍nvprof和nvvp。 + +:code:`paddle/math/tests` 目录中的 :code:`test_GpuProfiler` 就是用于展示上述分析工具的用法。 + +.. literalinclude:: ../../../paddle/math/tests/test_GpuProfiler.cpp + :language: c++ + :lines: 137-151 + :linenos: + +上述的代码片段包含了两种方法,您可以任意使用一个或两个来对感兴趣的代码段做性能分析。 + +1. :code:`REGISTER_TIMER_INFO` 是一个内置的定时器封装,可以用来计算CPU函数或cuda内核的时间消耗。 + +2. :code:`REGISTER_GPU_PROFILER` 是一个封装对象,封装了 :code:`cudaProfilerStart` 和 :code:`cudaProfilerStop` 两个操作;同时其内部实现可以避免纯CPU版本PaddlePaddle在执行本语句时发生崩溃。 + +您会在接下来的部分中获得更多的细节介绍。 + +详细教程 +============ + +内置定时器 +------------ + +如果想要启用PaddlePaddle的内置定时器,您首先需要在相关代码段中加入 :code:`REGISTER_TIMER_INFO`。 +接下来就可以使用 :code:`printStatus` 或者 :code:`printAllStatus` 函数来将信息输出到界面中。 +下面举个简单的例子: + +1. 加入 :code:`REGISTER_TIMER_INFO` 和 :code:`printAllStatus` 函数(如高亮部分)。 + + .. literalinclude:: ../../../paddle/math/tests/test_GpuProfiler.cpp + :language: c++ + :lines: 137-151 + :emphasize-lines: 8-12,14 + :linenos: + +2. cmake配置中将 **WITH_TIMER** 打开,重新编译PaddlePaddle。 + + .. code-block:: bash + + cmake .. -DWITH_TIMER=ON + make + +3. 执行您的代码,并观察结果(如高亮部分)。 + + .. code-block:: bash + :emphasize-lines: 1,12-15 + + > ./paddle/math/tests/test_GpuProfiler + I1117 11:13:42.313065 2522362816 Util.cpp:155] commandline: ./paddle/math/tests/test_GpuProfiler + I1117 11:13:42.845065 2522362816 Util.cpp:130] Calling runInitFunctions + I1117 11:13:42.845208 2522362816 Util.cpp:143] Call runInitFunctions done. + [==========] Running 1 test from 1 test case. + [----------] Global test environment set-up. 
+ [----------] 1 test from Profiler + [ RUN ] Profiler.BilinearFwdBwd + I1117 11:13:42.845310 2522362816 test_GpuProfiler.cpp:114] Enable GPU Profiler Stat: [testBilinearFwdBwd] "numSamples = 10, channels = 16, im + gSizeX = 64, imgSizeY = 64" + I1117 11:13:42.850154 2522362816 ThreadLocal.cpp:37] thread use undeterministic rand seed:20659751 + I1117 11:13:42.981501 2522362816 Stat.cpp:130] ======= StatSet: [GlobalStatInfo] status ====== + I1117 11:13:42.981539 2522362816 Stat.cpp:133] Stat=testBilinearFwdBwd total=136.141 avg=136.141 max=136.141 min=136.141 count=1 + I1117 11:13:42.981572 2522362816 Stat.cpp:141] ======= BarrierStatSet status ====== + I1117 11:13:42.981575 2522362816 Stat.cpp:154] -------------------------------------------------- + [ OK ] Profiler.BilinearFwdBwd (136 ms) + [----------] 1 test from Profiler (136 ms total) + + [----------] Global test environment tear-down + [==========] 1 test from 1 test case ran. (136 ms total) + [ PASSED ] 1 test. + +nvprof 工具 +---------------- + +要使用命令行分析工具 **nvprof**,您按如下步骤操作即可: + +1. 将 :code:`REGISTER_GPU_PROFILER` 函数加到代码中(参考强调部分)。 + + .. literalinclude:: ../../../paddle/math/tests/test_GpuProfiler.cpp + :language: c++ + :lines: 137-151 + :emphasize-lines: 6-7 + :linenos: + +2. cmake中将 **WITH_PROFILER** 配置打开,重新编译PaddlePaddle。 + + .. code-block:: bash + + cmake .. -DWITH_PROFILER=ON + make + +3. 使用 **nvprof** 来分析执行文件。 + + .. code-block:: bash + + nvprof ./paddle/math/tests/test_GpuProfiler + +然后,您就能获得如下的分析结果: + +.. code-block:: bash + + ==78544== Profiling application: ./paddle/math/tests/test_GpuProfiler + ==78544== Profiling result: + Time(%) Time Calls Avg Min Max Name + 27.60% 9.6305ms 5 1.9261ms 3.4560us 6.4035ms [CUDA memcpy HtoD] + 26.07% 9.0957ms 1 9.0957ms 9.0957ms 9.0957ms KeBilinearInterpBw + 23.78% 8.2977ms 1 8.2977ms 8.2977ms 8.2977ms KeBilinearInterpFw + 22.55% 7.8661ms 2 3.9330ms 1.5798ms 6.2863ms [CUDA memcpy DtoH] + + ==78544== API calls: + Time(%) Time Calls Avg Min Max Name + 46.85% 682.28ms 8 85.285ms 12.639us 682.03ms cudaStreamCreateWithFlags + 39.83% 580.00ms 4 145.00ms 302ns 550.27ms cudaFree + 9.82% 143.03ms 9 15.892ms 8.7090us 142.78ms cudaStreamCreate + 1.23% 17.983ms 7 2.5690ms 23.210us 6.4563ms cudaMemcpy + 1.23% 17.849ms 2 8.9247ms 8.4726ms 9.3768ms cudaStreamSynchronize + 0.66% 9.5969ms 7 1.3710ms 288.43us 2.4279ms cudaHostAlloc + 0.13% 1.9530ms 11 177.54us 7.6810us 591.06us cudaMalloc + 0.07% 1.0424ms 8 130.30us 1.6970us 453.72us cudaGetDevice + 0.04% 527.90us 40 13.197us 525ns 253.99us cudaEventCreateWithFlags + 0.03% 435.73us 348 1.2520us 124ns 42.704us cuDeviceGetAttribute + 0.03% 419.36us 1 419.36us 419.36us 419.36us cudaGetDeviceCount + 0.02% 260.75us 2 130.38us 129.32us 131.43us cudaGetDeviceProperties + 0.02% 222.32us 2 111.16us 106.94us 115.39us cudaLaunch + 0.01% 214.06us 4 53.514us 28.586us 77.655us cuDeviceGetName + 0.01% 115.45us 4 28.861us 9.8250us 44.526us cuDeviceTotalMem + 0.01% 83.988us 4 20.997us 578ns 77.760us cudaSetDevice + 0.00% 38.918us 1 38.918us 38.918us 38.918us cudaEventCreate + 0.00% 34.573us 31 1.1150us 279ns 12.784us cudaDeviceGetAttribute + 0.00% 17.767us 1 17.767us 17.767us 17.767us cudaProfilerStart + 0.00% 15.228us 2 7.6140us 3.5460us 11.682us cudaConfigureCall + 0.00% 14.536us 2 7.2680us 1.1490us 13.387us cudaGetLastError + 0.00% 8.6080us 26 331ns 173ns 783ns cudaSetupArgument + 0.00% 5.5470us 6 924ns 215ns 2.6780us cuDeviceGet + 0.00% 5.4090us 6 901ns 328ns 3.3320us cuDeviceGetCount + 0.00% 4.1770us 3 1.3920us 1.0630us 1.8300us cuDriverGetVersion + 0.00% 3.4650us 3 
1.1550us 1.0810us 1.2680us cuInit + 0.00% 830ns 1 830ns 830ns 830ns cudaRuntimeGetVersion + + +nvvp 工具 +-------------- + +如果想使用可视化的分析器 **nvvp**,您可以导入 :code:`nvprof -o ...` 的输出,或者从工具的界面里运行您的应用。 + +**备注: nvvp 也支持CPU的性能分析** (需在nvvp界面中选上才能开启) + +.. image:: nvvp1.png + :align: center + :scale: 33% + +从内核函数的角度, **nvvp** 可以精确说明一个长耗时操作的具体原因。 +同时,如下图所示, **nvvp** 的内核block使用情况、寄存器使用情况和共享内存使用情况能让我们对GPU的整体使用有更好的理解。 + + +.. image:: nvvp2.png + :align: center + :scale: 33% + +而从应用的角度, **nvvp** 可以帮您提供一些定位性能瓶颈的建议。 +例如,下图中就展示了一些关于内存数据迁徙和计算资源利用率的建议,为您做性能调优提供了方向。 + +.. image:: nvvp3.png + :align: center + :scale: 33% + +.. image:: nvvp4.png + :align: center + :scale: 33% + +性能分析小技巧 +================== + +- 开始阶段,从 **nvprof** 和 **nvvp** 的输出信息入手是个不错的选择。 +- 接下来可以考虑下时间线的分析。 +- 如果真想挖掘内核深处的某个秘密,您最好先确认:这一块的耗时比例真的太高,值得深入分析。 +- 可能的情况下,试着让输出的分析数据和理论值对应。 + + 1) 例如,如果我知道内核花了10ms来移动1GB数据,那我会期望分析工具统计到速度是100GB/s。 + 2) 若有不一致之处,很有可能实际应用就是没有按照您的预期情况运行。 +- 了解您的硬件:如果您的GPU理论可以达到6 TFLOPs(6万亿次浮点运算每秒),而当前已经有5.5 TFLOPs了,那估计这里的潜力就没啥好挖的了…… + +性能分析是性能优化的关键一步。有的时候简简单单的改变就能在性能上产生明显的优化效果! +当然,具体情况因人而异。 + +参考资料 +=========== +Jeremy Appleyard, `GPU Profiling for Deep Learning `_, 2015 diff --git a/doc/howto/optimization/gpu_profiling_en.rst b/doc/howto/optimization/gpu_profiling_en.rst index 40ba698f4e..ed208ceaf7 100644 --- a/doc/howto/optimization/gpu_profiling_en.rst +++ b/doc/howto/optimization/gpu_profiling_en.rst @@ -1,5 +1,8 @@ -Profiling on PaddlePaddle -========================= +==================== +Tune GPU Performance +==================== + +.. contents:: This tutorial will guide you step-by-step through how to conduct profiling and performance tuning using built-in timer, **nvprof** and **nvvp**. @@ -49,11 +52,11 @@ For general GPU profiling, a bunch of tools are provided from both NVIDIA and th In this tutorial, we will focus on nvprof and nvvp. :code:`test_GpuProfiler` from :code:`paddle/math/tests` directory will be used to evaluate -above profilers. +above profilers. .. literalinclude:: ../../../paddle/math/tests/test_GpuProfiler.cpp :language: c++ - :lines: 111-124 + :lines: 137-151 :linenos: The above code snippet includes two methods, you can use any of them to profile the regions of interest. @@ -79,8 +82,8 @@ As a simple example, consider the following: .. literalinclude:: ../../../paddle/math/tests/test_GpuProfiler.cpp :language: c++ - :lines: 111-124 - :emphasize-lines: 8-10,13 + :lines: 137-151 + :emphasize-lines: 8-12,14 :linenos: 2. Configure cmake with **WITH_TIMER** and recompile PaddlePaddle. @@ -90,31 +93,31 @@ As a simple example, consider the following: cmake .. -DWITH_TIMER=ON make -3. Execute your code and observe the results (see the emphasize-lines). +3. Execute your code and observe the results (see the emphasize-lines). .. code-block:: bash :emphasize-lines: 1,12-15 - > ./paddle/math/tests/test_GpuProfiler - I1117 11:13:42.313065 2522362816 Util.cpp:155] commandline: ./paddle/math/tests/test_GpuProfiler - I1117 11:13:42.845065 2522362816 Util.cpp:130] Calling runInitFunctions - I1117 11:13:42.845208 2522362816 Util.cpp:143] Call runInitFunctions done. - [==========] Running 1 test from 1 test case. - [----------] Global test environment set-up. - [----------] 1 test from Profiler - [ RUN ] Profiler.BilinearFwdBwd + > ./paddle/math/tests/test_GpuProfiler + I1117 11:13:42.313065 2522362816 Util.cpp:155] commandline: ./paddle/math/tests/test_GpuProfiler + I1117 11:13:42.845065 2522362816 Util.cpp:130] Calling runInitFunctions + I1117 11:13:42.845208 2522362816 Util.cpp:143] Call runInitFunctions done. 
+ [==========] Running 1 test from 1 test case. + [----------] Global test environment set-up. + [----------] 1 test from Profiler + [ RUN ] Profiler.BilinearFwdBwd I1117 11:13:42.845310 2522362816 test_GpuProfiler.cpp:114] Enable GPU Profiler Stat: [testBilinearFwdBwd] "numSamples = 10, channels = 16, im - gSizeX = 64, imgSizeY = 64" - I1117 11:13:42.850154 2522362816 ThreadLocal.cpp:37] thread use undeterministic rand seed:20659751 - I1117 11:13:42.981501 2522362816 Stat.cpp:130] ======= StatSet: [GlobalStatInfo] status ====== - I1117 11:13:42.981539 2522362816 Stat.cpp:133] Stat=testBilinearFwdBwd total=136.141 avg=136.141 max=136.141 min=136.141 count=1 - I1117 11:13:42.981572 2522362816 Stat.cpp:141] ======= BarrierStatSet status ====== - I1117 11:13:42.981575 2522362816 Stat.cpp:154] -------------------------------------------------- - [ OK ] Profiler.BilinearFwdBwd (136 ms) - [----------] 1 test from Profiler (136 ms total) - - [----------] Global test environment tear-down - [==========] 1 test from 1 test case ran. (136 ms total) + gSizeX = 64, imgSizeY = 64" + I1117 11:13:42.850154 2522362816 ThreadLocal.cpp:37] thread use undeterministic rand seed:20659751 + I1117 11:13:42.981501 2522362816 Stat.cpp:130] ======= StatSet: [GlobalStatInfo] status ====== + I1117 11:13:42.981539 2522362816 Stat.cpp:133] Stat=testBilinearFwdBwd total=136.141 avg=136.141 max=136.141 min=136.141 count=1 + I1117 11:13:42.981572 2522362816 Stat.cpp:141] ======= BarrierStatSet status ====== + I1117 11:13:42.981575 2522362816 Stat.cpp:154] -------------------------------------------------- + [ OK ] Profiler.BilinearFwdBwd (136 ms) + [----------] 1 test from Profiler (136 ms total) + + [----------] Global test environment tear-down + [==========] 1 test from 1 test case ran. (136 ms total) [ PASSED ] 1 test. nvprof profiler @@ -126,7 +129,7 @@ To use this command line profiler **nvprof**, you can simply issue the following .. literalinclude:: ../../../paddle/math/tests/test_GpuProfiler.cpp :language: c++ - :lines: 111-124 + :lines: 137-151 :emphasize-lines: 6-7 :linenos: @@ -147,42 +150,42 @@ Then, you can get the following profiling result: .. 
code-block:: bash - ==78544== Profiling application: ./paddle/math/tests/test_GpuProfiler - ==78544== Profiling result: - Time(%) Time Calls Avg Min Max Name - 27.60% 9.6305ms 5 1.9261ms 3.4560us 6.4035ms [CUDA memcpy HtoD] - 26.07% 9.0957ms 1 9.0957ms 9.0957ms 9.0957ms KeBilinearInterpBw - 23.78% 8.2977ms 1 8.2977ms 8.2977ms 8.2977ms KeBilinearInterpFw - 22.55% 7.8661ms 2 3.9330ms 1.5798ms 6.2863ms [CUDA memcpy DtoH] - - ==78544== API calls: - Time(%) Time Calls Avg Min Max Name - 46.85% 682.28ms 8 85.285ms 12.639us 682.03ms cudaStreamCreateWithFlags - 39.83% 580.00ms 4 145.00ms 302ns 550.27ms cudaFree - 9.82% 143.03ms 9 15.892ms 8.7090us 142.78ms cudaStreamCreate - 1.23% 17.983ms 7 2.5690ms 23.210us 6.4563ms cudaMemcpy - 1.23% 17.849ms 2 8.9247ms 8.4726ms 9.3768ms cudaStreamSynchronize - 0.66% 9.5969ms 7 1.3710ms 288.43us 2.4279ms cudaHostAlloc - 0.13% 1.9530ms 11 177.54us 7.6810us 591.06us cudaMalloc - 0.07% 1.0424ms 8 130.30us 1.6970us 453.72us cudaGetDevice - 0.04% 527.90us 40 13.197us 525ns 253.99us cudaEventCreateWithFlags - 0.03% 435.73us 348 1.2520us 124ns 42.704us cuDeviceGetAttribute - 0.03% 419.36us 1 419.36us 419.36us 419.36us cudaGetDeviceCount - 0.02% 260.75us 2 130.38us 129.32us 131.43us cudaGetDeviceProperties - 0.02% 222.32us 2 111.16us 106.94us 115.39us cudaLaunch - 0.01% 214.06us 4 53.514us 28.586us 77.655us cuDeviceGetName - 0.01% 115.45us 4 28.861us 9.8250us 44.526us cuDeviceTotalMem - 0.01% 83.988us 4 20.997us 578ns 77.760us cudaSetDevice - 0.00% 38.918us 1 38.918us 38.918us 38.918us cudaEventCreate - 0.00% 34.573us 31 1.1150us 279ns 12.784us cudaDeviceGetAttribute - 0.00% 17.767us 1 17.767us 17.767us 17.767us cudaProfilerStart - 0.00% 15.228us 2 7.6140us 3.5460us 11.682us cudaConfigureCall - 0.00% 14.536us 2 7.2680us 1.1490us 13.387us cudaGetLastError - 0.00% 8.6080us 26 331ns 173ns 783ns cudaSetupArgument - 0.00% 5.5470us 6 924ns 215ns 2.6780us cuDeviceGet - 0.00% 5.4090us 6 901ns 328ns 3.3320us cuDeviceGetCount - 0.00% 4.1770us 3 1.3920us 1.0630us 1.8300us cuDriverGetVersion - 0.00% 3.4650us 3 1.1550us 1.0810us 1.2680us cuInit + ==78544== Profiling application: ./paddle/math/tests/test_GpuProfiler + ==78544== Profiling result: + Time(%) Time Calls Avg Min Max Name + 27.60% 9.6305ms 5 1.9261ms 3.4560us 6.4035ms [CUDA memcpy HtoD] + 26.07% 9.0957ms 1 9.0957ms 9.0957ms 9.0957ms KeBilinearInterpBw + 23.78% 8.2977ms 1 8.2977ms 8.2977ms 8.2977ms KeBilinearInterpFw + 22.55% 7.8661ms 2 3.9330ms 1.5798ms 6.2863ms [CUDA memcpy DtoH] + + ==78544== API calls: + Time(%) Time Calls Avg Min Max Name + 46.85% 682.28ms 8 85.285ms 12.639us 682.03ms cudaStreamCreateWithFlags + 39.83% 580.00ms 4 145.00ms 302ns 550.27ms cudaFree + 9.82% 143.03ms 9 15.892ms 8.7090us 142.78ms cudaStreamCreate + 1.23% 17.983ms 7 2.5690ms 23.210us 6.4563ms cudaMemcpy + 1.23% 17.849ms 2 8.9247ms 8.4726ms 9.3768ms cudaStreamSynchronize + 0.66% 9.5969ms 7 1.3710ms 288.43us 2.4279ms cudaHostAlloc + 0.13% 1.9530ms 11 177.54us 7.6810us 591.06us cudaMalloc + 0.07% 1.0424ms 8 130.30us 1.6970us 453.72us cudaGetDevice + 0.04% 527.90us 40 13.197us 525ns 253.99us cudaEventCreateWithFlags + 0.03% 435.73us 348 1.2520us 124ns 42.704us cuDeviceGetAttribute + 0.03% 419.36us 1 419.36us 419.36us 419.36us cudaGetDeviceCount + 0.02% 260.75us 2 130.38us 129.32us 131.43us cudaGetDeviceProperties + 0.02% 222.32us 2 111.16us 106.94us 115.39us cudaLaunch + 0.01% 214.06us 4 53.514us 28.586us 77.655us cuDeviceGetName + 0.01% 115.45us 4 28.861us 9.8250us 44.526us cuDeviceTotalMem + 0.01% 83.988us 4 20.997us 578ns 77.760us 
cudaSetDevice + 0.00% 38.918us 1 38.918us 38.918us 38.918us cudaEventCreate + 0.00% 34.573us 31 1.1150us 279ns 12.784us cudaDeviceGetAttribute + 0.00% 17.767us 1 17.767us 17.767us 17.767us cudaProfilerStart + 0.00% 15.228us 2 7.6140us 3.5460us 11.682us cudaConfigureCall + 0.00% 14.536us 2 7.2680us 1.1490us 13.387us cudaGetLastError + 0.00% 8.6080us 26 331ns 173ns 783ns cudaSetupArgument + 0.00% 5.5470us 6 924ns 215ns 2.6780us cuDeviceGet + 0.00% 5.4090us 6 901ns 328ns 3.3320us cuDeviceGetCount + 0.00% 4.1770us 3 1.3920us 1.0630us 1.8300us cuDriverGetVersion + 0.00% 3.4650us 3 1.1550us 1.0810us 1.2680us cuInit 0.00% 830ns 1 830ns 830ns 830ns cudaRuntimeGetVersion diff --git a/doc/howto/optimization/index_en.rst b/doc/howto/optimization/index_en.rst deleted file mode 100644 index 1e2f16b5da..0000000000 --- a/doc/howto/optimization/index_en.rst +++ /dev/null @@ -1,7 +0,0 @@ -How to Tune GPU Performance -=========================== - -.. toctree:: - :maxdepth: 3 - - gpu_profiling_en.rst diff --git a/doc/howto/cluster/cluster_train_en.md b/doc/howto/usage/cluster/cluster_train_en.md similarity index 99% rename from doc/howto/cluster/cluster_train_en.md rename to doc/howto/usage/cluster/cluster_train_en.md index 1de34a6a99..2fd24e532e 100644 --- a/doc/howto/cluster/cluster_train_en.md +++ b/doc/howto/usage/cluster/cluster_train_en.md @@ -1,4 +1,4 @@ -# How to Run Distributed Training +# Run Distributed Training In this article, we explain how to run distributed Paddle training jobs on clusters. We will create the distributed version of the single-process training example, [recommendation](https://github.com/baidu/Paddle/tree/develop/demo/recommendation). diff --git a/doc/howto/cluster/k8s/Dockerfile b/doc/howto/usage/cluster/k8s/Dockerfile similarity index 100% rename from doc/howto/cluster/k8s/Dockerfile rename to doc/howto/usage/cluster/k8s/Dockerfile diff --git a/doc/howto/cluster/k8s/job.yaml b/doc/howto/usage/cluster/k8s/job.yaml similarity index 100% rename from doc/howto/cluster/k8s/job.yaml rename to doc/howto/usage/cluster/k8s/job.yaml diff --git a/doc/howto/cluster/k8s/k8s-paddle-arch.png b/doc/howto/usage/cluster/k8s/k8s-paddle-arch.png similarity index 100% rename from doc/howto/cluster/k8s/k8s-paddle-arch.png rename to doc/howto/usage/cluster/k8s/k8s-paddle-arch.png diff --git a/doc/howto/cluster/k8s/paddle_on_k8s_cn.md b/doc/howto/usage/cluster/k8s/k8s_cn.md similarity index 99% rename from doc/howto/cluster/k8s/paddle_on_k8s_cn.md rename to doc/howto/usage/cluster/k8s/k8s_cn.md index f8c9f19a9f..2575701053 100644 --- a/doc/howto/cluster/k8s/paddle_on_k8s_cn.md +++ b/doc/howto/usage/cluster/k8s/k8s_cn.md @@ -1,4 +1,4 @@ -# Paddle On Kubernetes:单机训练 +# Kubernetes 单机训练 在这篇文档里,我们介绍如何在 Kubernetes 集群上启动一个单机使用CPU的Paddle训练作业。在下一篇中,我们将介绍如何启动分布式训练作业。 diff --git a/doc/howto/cluster/k8s/distributed_training_on_k8s_cn.md b/doc/howto/usage/cluster/k8s/k8s_distributed_cn.md similarity index 99% rename from doc/howto/cluster/k8s/distributed_training_on_k8s_cn.md rename to doc/howto/usage/cluster/k8s/k8s_distributed_cn.md index 64f8fd4b43..d4d01f2759 100644 --- a/doc/howto/cluster/k8s/distributed_training_on_k8s_cn.md +++ b/doc/howto/usage/cluster/k8s/k8s_distributed_cn.md @@ -1,5 +1,4 @@ - -# PaddlePaddle on Kubernetes:分布式训练 +# Kubernetes 分布式训练 前一篇文章介绍了如何在Kubernetes集群上启动一个单机PaddlePaddle训练作业 (Job)。在这篇文章里,我们介绍如何在Kubernetes集群上进行分布式PaddlePaddle训练作业。关于PaddlePaddle的分布式训练,文章 [Cluster 
Training](https://github.com/baidu/Paddle/blob/develop/doc/cluster/opensource/cluster_train.md)介绍了一种通过SSH远程分发任务,进行分布式训练的方法,与此不同的是,本文将介绍在Kubernetes容器管理平台上快速构建PaddlePaddle容器集群,进行分布式训练的方案。 diff --git a/doc/howto/cluster/k8s/start.sh b/doc/howto/usage/cluster/k8s/start.sh similarity index 100% rename from doc/howto/cluster/k8s/start.sh rename to doc/howto/usage/cluster/k8s/start.sh diff --git a/doc/howto/cluster/k8s/start_paddle.py b/doc/howto/usage/cluster/k8s/start_paddle.py similarity index 100% rename from doc/howto/cluster/k8s/start_paddle.py rename to doc/howto/usage/cluster/k8s/start_paddle.py diff --git a/doc/howto/cmd_parameter/arguments_en.md b/doc/howto/usage/cmd_parameter/arguments_en.md similarity index 100% rename from doc/howto/cmd_parameter/arguments_en.md rename to doc/howto/usage/cmd_parameter/arguments_en.md diff --git a/doc/howto/cmd_parameter/detail_introduction_en.md b/doc/howto/usage/cmd_parameter/detail_introduction_en.md similarity index 100% rename from doc/howto/cmd_parameter/detail_introduction_en.md rename to doc/howto/usage/cmd_parameter/detail_introduction_en.md diff --git a/doc/howto/cmd_parameter/index_en.md b/doc/howto/usage/cmd_parameter/index_en.md similarity index 80% rename from doc/howto/cmd_parameter/index_en.md rename to doc/howto/usage/cmd_parameter/index_en.md index a6c236db61..2a96e7e976 100644 --- a/doc/howto/cmd_parameter/index_en.md +++ b/doc/howto/usage/cmd_parameter/index_en.md @@ -1,7 +1,7 @@ ```eval_rst .. _cmd_line_index: ``` -# How to Set Command-line Parameters +# Set Command-line Parameters * [Use Case](use_case_en.md) * [Arguments](arguments_en.md) diff --git a/doc/howto/cmd_parameter/use_case_en.md b/doc/howto/usage/cmd_parameter/use_case_en.md similarity index 100% rename from doc/howto/cmd_parameter/use_case_en.md rename to doc/howto/usage/cmd_parameter/use_case_en.md diff --git a/doc/howto/concepts/src/pserver_topology.dot b/doc/howto/usage/concepts/src/pserver_topology.dot similarity index 100% rename from doc/howto/concepts/src/pserver_topology.dot rename to doc/howto/usage/concepts/src/pserver_topology.dot diff --git a/doc/howto/concepts/src/trainer_config.py b/doc/howto/usage/concepts/src/trainer_config.py similarity index 100% rename from doc/howto/concepts/src/trainer_config.py rename to doc/howto/usage/concepts/src/trainer_config.py diff --git a/doc/howto/concepts/use_concepts_cn.rst b/doc/howto/usage/concepts/use_concepts_cn.rst similarity index 99% rename from doc/howto/concepts/use_concepts_cn.rst rename to doc/howto/usage/concepts/use_concepts_cn.rst index 6b87522088..77ba764419 100644 --- a/doc/howto/concepts/use_concepts_cn.rst +++ b/doc/howto/usage/concepts/use_concepts_cn.rst @@ -1,6 +1,6 @@ -######################### -PaddlePaddle 基本使用概念 -######################### +############ +基本使用概念 +############ PaddlePaddle是一个深度学习框架,支持单机模式和多机模式。 diff --git a/doc/conf.py.cn.in b/doc/templates/conf.py.cn.in similarity index 100% rename from doc/conf.py.cn.in rename to doc/templates/conf.py.cn.in diff --git a/doc/conf.py.en.in b/doc/templates/conf.py.en.in similarity index 100% rename from doc/conf.py.en.in rename to doc/templates/conf.py.en.in diff --git a/doc/tutorials/index_cn.md b/doc/tutorials/index_cn.md index fddaee5b2d..adc75978a7 100644 --- a/doc/tutorials/index_cn.md +++ b/doc/tutorials/index_cn.md @@ -1,23 +1,24 @@ -# TUTORIALS -There are several examples and demos here. 
+# 完整教程 -## Quick Start +## 快速入门 -* [Quick Start](quick_start/index_cn.rst) +使用商品评论分类任务,系统性地介绍如何一步步改进,最终得到产品级的深度模型。 -## Image +* [阅读教程](quick_start/index_cn.rst) + +## 图像 * TBD -## NLP +## 自然语言处理 -* [Sentiment Analysis](sentiment_analysis/index_cn.md) -* [Semantic Role Labeling](semantic_role_labeling/index_cn.rst) +* [情感分类](sentiment_analysis/index_cn.md) +* [语义角色标注](semantic_role_labeling/index_cn.md) -## Recommendation +## 个性化推荐 * TBD -## Model Zoo +## 常用模型 * TBD diff --git a/doc/tutorials/index_en.md b/doc/tutorials/index_en.md index 039ec4b4a4..63b2091c24 100644 --- a/doc/tutorials/index_en.md +++ b/doc/tutorials/index_en.md @@ -17,7 +17,6 @@ There are several examples and demos here. ## Recommendation -* [MovieLens Dataset](rec/ml_dataset_en.md) * [MovieLens Regression](rec/ml_regression_en.rst) ## Model Zoo diff --git a/doc/tutorials/quick_start/index_cn.rst b/doc/tutorials/quick_start/index_cn.rst index 754c2f6212..936f16118a 100644 --- a/doc/tutorials/quick_start/index_cn.rst +++ b/doc/tutorials/quick_start/index_cn.rst @@ -1,5 +1,6 @@ -PaddlePaddle快速入门教程 -======================== +============= +快速入门教程 +============= 我们将以 `文本分类问题 `_ 为例, 介绍PaddlePaddle的基本使用方法。 diff --git a/paddle/api/CMakeLists.txt b/paddle/api/CMakeLists.txt index 9b2d122a09..6ad1d79e59 100644 --- a/paddle/api/CMakeLists.txt +++ b/paddle/api/CMakeLists.txt @@ -17,22 +17,18 @@ add_library(paddle_api STATIC ${API_SOURCES}) add_dependencies(paddle_api gen_proto_cpp) +list(LENGTH "${GFLAGS_LIBRARIES}" GFLAGS_LIBRARIES_LENGTH) -if(WITH_GFLAGS) - list(LENGTH "${GFLAGS_LIBRARIES}" GFLAGS_LIBRARIES_LENGTH) - - if(${GFLAGS_LIBRARIES_LENGTH} EQUAL 0 AND TARGET "${GFLAGS_LIBRARIES}") - # Because gflags compiled by cmake, so it is imported by cmake target, - # not a real library path. Get the real library path here. - message(STATUS "GFLAGS Libraries is ${GFLAGS_LIBRARIES}") - get_target_property(GFLAGS_LOCATION ${GFLAGS_LIBRARIES} LOCATION) - message(STATUS "GFLAGS Target location is ${GFLAGS_LOCATION}") - else() - set(GFLAGS_LOCATION ${GFLAGS_LIBRARIES}) - endif() +if(${GFLAGS_LIBRARIES_LENGTH} EQUAL 0 AND TARGET "${GFLAGS_LIBRARIES}") +  # Because gflags is compiled by cmake, it is imported as a cmake target, +  # not a real library path. Get the real library path here. +  message(STATUS "GFLAGS Libraries is ${GFLAGS_LIBRARIES}") +  get_target_property(GFLAGS_LOCATION ${GFLAGS_LIBRARIES} LOCATION) +  message(STATUS "GFLAGS Target location is ${GFLAGS_LOCATION}") +else() +  set(GFLAGS_LOCATION ${GFLAGS_LIBRARIES}) endif() - configure_file( paddle_api_config.py.in ${PROJ_ROOT}/paddle/api/paddle_api_config.py @@ -57,7 +53,7 @@ add_custom_command(OUTPUT ${PROJ_ROOT}/paddle/dist/.timestamp paddle_trainer paddle_api paddle_cuda - ${PY_PADDLE_PYTHON_FILES} + ${PY_PADDLE_PYTHON_FILES} ) install(DIRECTORY ${PROJ_ROOT}/paddle/dist/ diff --git a/paddle/api/Trainer.cpp b/paddle/api/Trainer.cpp index 59b47d4b1c..d83dc380be 100644 --- a/paddle/api/Trainer.cpp +++ b/paddle/api/Trainer.cpp @@ -27,9 +27,9 @@ limitations under the License. 
*/ using paddle::real; -P_DECLARE_string(config); -P_DECLARE_string(init_model_path); -P_DECLARE_int32(start_pass); +DECLARE_string(config); +DECLARE_string(init_model_path); +DECLARE_int32(start_pass); struct TrainerPrivate : public paddle::Trainer { bool _trainOneBatch(size_t batchSize); diff --git a/paddle/api/paddle_api_config.py.in b/paddle/api/paddle_api_config.py.in index a2352250c3..23542b952b 100644 --- a/paddle/api/paddle_api_config.py.in +++ b/paddle/api/paddle_api_config.py.in @@ -8,9 +8,7 @@ CMAKE_DL_LIBS="@CMAKE_DL_LIBS@" WITH_PYTHON="@WITH_PYTHON@" PYTHON_LIBRARIES="@PYTHON_LIBRARIES@" -WITH_GLOG="@WITH_GLOG@" LIBGLOG_LIBRARY="@LIBGLOG_LIBRARY@" -WITH_GFLAGS="@WITH_GFLAGS@" GFLAGS_LIBRARIES="@GFLAGS_LIBRARIES@" GFLAGS_LOCATION="@GFLAGS_LOCATION@" CBLAS_LIBRARIES="@CBLAS_LIBS@" diff --git a/paddle/api/paddle_ld_flags.py b/paddle/api/paddle_ld_flags.py index 85cc54700f..51d7dfee58 100644 --- a/paddle/api/paddle_ld_flags.py +++ b/paddle/api/paddle_ld_flags.py @@ -47,10 +47,8 @@ try: self.with_python = PaddleLDFlag.cmake_bool(WITH_PYTHON) self.python_libs = PYTHON_LIBRARIES - self.with_glog = PaddleLDFlag.cmake_bool(WITH_GLOG) self.glog_libs = LIBGLOG_LIBRARY - self.with_gflags = PaddleLDFlag.cmake_bool(WITH_GFLAGS) self.with_coverage = PaddleLDFlag.cmake_bool(WITH_COVERALLS) self.gflags_libs = GFLAGS_LIBRARIES self.gflags_location = GFLAGS_LOCATION @@ -88,6 +86,8 @@ try: "-lpaddle_cuda", "-lpaddle_api", self.normalize_flag(self.protolib), + self.normalize_flag(self.glog_libs), + self.normalize_flag(self.gflags_libs), self.normalize_flag(self.zlib), self.normalize_flag(self.thread), self.normalize_flag(self.dl_libs), @@ -96,10 +96,6 @@ try: if self.with_python: libs.append(self.normalize_flag(self.python_libs)) - if self.with_glog: - libs.append(self.normalize_flag(self.glog_libs)) - if self.with_gflags: - libs.append(self.normalize_flag(self.gflags_libs)) if self.with_gpu: libs.append(self.normalize_flag(self.curt)) if self.with_coverage: diff --git a/paddle/cuda/src/hl_cuda_cudnn.cc b/paddle/cuda/src/hl_cuda_cudnn.cc index 7111224d59..8cddf10d40 100644 --- a/paddle/cuda/src/hl_cuda_cudnn.cc +++ b/paddle/cuda/src/hl_cuda_cudnn.cc @@ -21,10 +21,10 @@ limitations under the License. */ #include "paddle/utils/CommandLineParser.h" #include "paddle/utils/Logging.h" -P_DEFINE_int32(cudnn_conv_workspace_limit_in_mb, - 4096, - "Specify cuDNN max workspace limit, in units MB, " - "4096MB=4GB by default."); +DEFINE_int32(cudnn_conv_workspace_limit_in_mb, + 4096, + "Specify cuDNN max workspace limit, in units MB, " + "4096MB=4GB by default."); namespace dynload { diff --git a/paddle/cuda/src/hl_cuda_device.cc b/paddle/cuda/src/hl_cuda_device.cc index 41787f6c0a..a71eecba27 100644 --- a/paddle/cuda/src/hl_cuda_device.cc +++ b/paddle/cuda/src/hl_cuda_device.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ // clang-format off -// Because clang-format 4.X and clang-format 3.8+ format +// Because clang-format 4.X and clang-format 3.8+ format // following lines in different. So disable clang-format. #include "hl_cuda.h" #include @@ -22,6 +22,7 @@ limitations under the License. 
*/ #include #include #include +#include "hl_cuda.h" #include "hl_cuda.ph" #include "hl_dso_loader.h" #include "hl_thread.ph" diff --git a/paddle/cuda/src/hl_dso_loader.cc b/paddle/cuda/src/hl_dso_loader.cc index f509b89243..54c7620fc0 100644 --- a/paddle/cuda/src/hl_dso_loader.cc +++ b/paddle/cuda/src/hl_dso_loader.cc @@ -16,21 +16,21 @@ limitations under the License. */ #include "paddle/utils/CommandLineParser.h" #include "paddle/utils/Logging.h" -P_DEFINE_string(cudnn_dir, - "", - "Specify path for loading libcudnn.so. For instance, " - "/usr/local/cudnn/lib. If empty [default], dlopen " - "will search cudnn from LD_LIBRARY_PATH"); - -P_DEFINE_string(cuda_dir, - "", - "Specify path for loading cuda library, such as libcublas, " - "libcurand. For instance, /usr/local/cuda/lib64. (Note: " - "libcudart can not be specified by cuda_dir, since some " - "build-in function in cudart already ran before main entry). " - "If default, dlopen will search cuda from LD_LIBRARY_PATH"); - -P_DEFINE_string(warpctc_dir, "", "Specify path for loading libwarpctc.so."); +DEFINE_string(cudnn_dir, + "", + "Specify path for loading libcudnn.so. For instance, " + "/usr/local/cudnn/lib. If empty [default], dlopen " + "will search cudnn from LD_LIBRARY_PATH"); + +DEFINE_string(cuda_dir, + "", + "Specify path for loading cuda library, such as libcublas, " + "libcurand. For instance, /usr/local/cuda/lib64. (Note: " + "libcudart can not be specified by cuda_dir, since some " + "build-in function in cudart already ran before main entry). " + "If default, dlopen will search cuda from LD_LIBRARY_PATH"); + +DEFINE_string(warpctc_dir, "", "Specify path for loading libwarpctc.so."); static inline std::string join(const std::string& part1, const std::string& part2) { diff --git a/paddle/gserver/dataproviders/ProtoDataProvider.cpp b/paddle/gserver/dataproviders/ProtoDataProvider.cpp index d16ecca2d9..c6f5cab191 100644 --- a/paddle/gserver/dataproviders/ProtoDataProvider.cpp +++ b/paddle/gserver/dataproviders/ProtoDataProvider.cpp @@ -22,9 +22,9 @@ limitations under the License. */ #include "DataProviderGroup.h" #include "paddle/utils/Logging.h" -P_DEFINE_double(memory_threshold_on_load_data, - 1.0, - "stop loading data when memory is not sufficient"); +DEFINE_double(memory_threshold_on_load_data, + 1.0, + "stop loading data when memory is not sufficient"); namespace paddle { diff --git a/paddle/gserver/evaluators/Evaluator.cpp b/paddle/gserver/evaluators/Evaluator.cpp index 7556d21e01..2f99281911 100644 --- a/paddle/gserver/evaluators/Evaluator.cpp +++ b/paddle/gserver/evaluators/Evaluator.cpp @@ -17,7 +17,7 @@ limitations under the License. */ #include "paddle/gserver/gradientmachines/NeuralNetwork.h" -P_DECLARE_int32(trainer_id); +DECLARE_int32(trainer_id); namespace paddle { diff --git a/paddle/gserver/gradientmachines/MultiGradientMachine.cpp b/paddle/gserver/gradientmachines/MultiGradientMachine.cpp index a7324f5545..88c098b355 100644 --- a/paddle/gserver/gradientmachines/MultiGradientMachine.cpp +++ b/paddle/gserver/gradientmachines/MultiGradientMachine.cpp @@ -21,11 +21,11 @@ limitations under the License. 
*/ #include "NeuralNetwork.h" #include "ParallelNeuralNetwork.h" -P_DEFINE_bool(allow_only_one_model_on_one_gpu, - true, - "If true, do not allow multiple models on one GPU device"); +DEFINE_bool(allow_only_one_model_on_one_gpu, + true, + "If true, do not allow multiple models on one GPU device"); #ifdef PADDLE_METRIC_LEARNING -P_DECLARE_bool(external); +DECLARE_bool(external); #endif namespace paddle { diff --git a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp index ee1c92bdf5..8f68b3d66b 100644 --- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp +++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp @@ -24,7 +24,7 @@ limitations under the License. */ #include "paddle/utils/Stat.h" #include "paddle/utils/Util.h" -P_DEFINE_string(diy_beam_search_prob_so, "", "the diy beam search cost so"); +DEFINE_string(diy_beam_search_prob_so, "", "the diy beam search cost so"); static const char* DIY_CALC_PROB_SYMBOL_NAME = "calc_prob"; static const char* DIY_START_CALC_PROB_SYMBOL_NAME = "start_calc_prob"; diff --git a/paddle/gserver/layers/DataLayer.cpp b/paddle/gserver/layers/DataLayer.cpp index 66f0606a38..3551df4e17 100644 --- a/paddle/gserver/layers/DataLayer.cpp +++ b/paddle/gserver/layers/DataLayer.cpp @@ -54,7 +54,7 @@ void DataLayer::copyDataToOutput(Argument& output) { output.setFrameWidth(config_.width()); } else { output.setFrameHeight(data_.getFrameHeight()); - output.setFrameHeight(data_.getFrameHeight()); + output.setFrameWidth(data_.getFrameWidth()); } output.cpuSequenceDims = data_.cpuSequenceDims; output.sequenceStartPositions = data_.sequenceStartPositions; diff --git a/paddle/gserver/layers/Layer.cpp b/paddle/gserver/layers/Layer.cpp index c9e121047b..c47943f81c 100644 --- a/paddle/gserver/layers/Layer.cpp +++ b/paddle/gserver/layers/Layer.cpp @@ -33,7 +33,7 @@ limitations under the License. */ #include "TransLayer.h" #include "ValidationLayer.h" -P_DEFINE_bool(log_error_clipping, false, "enable log error clipping or not"); +DEFINE_bool(log_error_clipping, false, "enable log error clipping or not"); namespace paddle { diff --git a/paddle/gserver/layers/LstmLayer.cpp b/paddle/gserver/layers/LstmLayer.cpp index 452091eff4..2543d1b49a 100644 --- a/paddle/gserver/layers/LstmLayer.cpp +++ b/paddle/gserver/layers/LstmLayer.cpp @@ -17,7 +17,7 @@ limitations under the License. */ #include "paddle/math/Matrix.h" #include "paddle/utils/Stat.h" -P_DECLARE_bool(prev_batch_state); +DECLARE_bool(prev_batch_state); namespace paddle { diff --git a/paddle/gserver/layers/RecurrentLayer.cpp b/paddle/gserver/layers/RecurrentLayer.cpp index 9f3bf76a2d..85812c9d66 100644 --- a/paddle/gserver/layers/RecurrentLayer.cpp +++ b/paddle/gserver/layers/RecurrentLayer.cpp @@ -17,7 +17,7 @@ limitations under the License. */ #include "paddle/utils/CommandLineParser.h" #include "paddle/utils/Stat.h" -P_DEFINE_bool(rnn_use_batch, false, "Using the batch method for calculation."); +DEFINE_bool(rnn_use_batch, false, "Using the batch method for calculation."); namespace paddle { diff --git a/paddle/gserver/layers/ValidationLayer.h b/paddle/gserver/layers/ValidationLayer.h index 471055429d..4c1de7b3b7 100644 --- a/paddle/gserver/layers/ValidationLayer.h +++ b/paddle/gserver/layers/ValidationLayer.h @@ -18,7 +18,7 @@ limitations under the License. 
*/ #include "Layer.h" #include "paddle/gserver/evaluators/Evaluator.h" -P_DECLARE_int32(trainer_id); +DECLARE_int32(trainer_id); namespace paddle { diff --git a/paddle/gserver/tests/LayerGradUtil.cpp b/paddle/gserver/tests/LayerGradUtil.cpp index dffc24936f..1d5e7de1ba 100644 --- a/paddle/gserver/tests/LayerGradUtil.cpp +++ b/paddle/gserver/tests/LayerGradUtil.cpp @@ -14,7 +14,7 @@ limitations under the License. */ #include "LayerGradUtil.h" -P_DECLARE_bool(thread_local_rand_use_global_seed); +DECLARE_bool(thread_local_rand_use_global_seed); namespace paddle { real getCostSum(LayerPtr& testLayer, MatrixPtr weights) { diff --git a/paddle/gserver/tests/TestUtil.cpp b/paddle/gserver/tests/TestUtil.cpp index e656da5b8f..e07c60861a 100644 --- a/paddle/gserver/tests/TestUtil.cpp +++ b/paddle/gserver/tests/TestUtil.cpp @@ -17,7 +17,7 @@ limitations under the License. */ #include "paddle/math/SparseMatrix.h" #include "paddle/utils/CommandLineParser.h" -P_DEFINE_int32(fixed_seq_length, 0, "Produce some sequence of fixed length"); +DEFINE_int32(fixed_seq_length, 0, "Produce some sequence of fixed length"); namespace paddle { diff --git a/paddle/gserver/tests/test_ActivationGrad.cpp b/paddle/gserver/tests/test_ActivationGrad.cpp index 20a6126d0b..7d7e68da5c 100644 --- a/paddle/gserver/tests/test_ActivationGrad.cpp +++ b/paddle/gserver/tests/test_ActivationGrad.cpp @@ -25,8 +25,8 @@ limitations under the License. */ using namespace paddle; // NOLINT using namespace std; // NOLINT -P_DECLARE_bool(use_gpu); -P_DECLARE_bool(thread_local_rand_use_global_seed); +DECLARE_bool(use_gpu); +DECLARE_bool(thread_local_rand_use_global_seed); void testActivation(const string& act) { LOG(INFO) << "test activation: " << act; diff --git a/paddle/gserver/tests/test_BatchNorm.cpp b/paddle/gserver/tests/test_BatchNorm.cpp index 3bd4e321b7..7f5fcb670b 100644 --- a/paddle/gserver/tests/test_BatchNorm.cpp +++ b/paddle/gserver/tests/test_BatchNorm.cpp @@ -27,11 +27,11 @@ limitations under the License. */ using namespace paddle; // NOLINT using namespace std; // NOLINT -P_DECLARE_bool(use_gpu); -P_DECLARE_int32(gpu_id); -P_DECLARE_double(checkgrad_eps); -P_DECLARE_bool(thread_local_rand_use_global_seed); -P_DECLARE_bool(prev_batch_state); +DECLARE_bool(use_gpu); +DECLARE_int32(gpu_id); +DECLARE_double(checkgrad_eps); +DECLARE_bool(thread_local_rand_use_global_seed); +DECLARE_bool(prev_batch_state); // Test that the batchNormLayer can be followed by a ConvLayer TEST(Layer, batchNorm) { diff --git a/paddle/gserver/tests/test_ConvTrans.cpp b/paddle/gserver/tests/test_ConvTrans.cpp index 83100e3bec..99202c2d57 100644 --- a/paddle/gserver/tests/test_ConvTrans.cpp +++ b/paddle/gserver/tests/test_ConvTrans.cpp @@ -28,11 +28,11 @@ limitations under the License. */ using namespace paddle; // NOLINT using namespace std; // NOLINT -P_DECLARE_bool(use_gpu); -P_DECLARE_int32(gpu_id); -P_DECLARE_double(checkgrad_eps); -P_DECLARE_bool(thread_local_rand_use_global_seed); -P_DECLARE_bool(prev_batch_state); +DECLARE_bool(use_gpu); +DECLARE_int32(gpu_id); +DECLARE_double(checkgrad_eps); +DECLARE_bool(thread_local_rand_use_global_seed); +DECLARE_bool(prev_batch_state); // Test that the convTrans forward is the same as conv backward TEST(Layer, convTransLayerFwd) { diff --git a/paddle/gserver/tests/test_ConvUnify.cpp b/paddle/gserver/tests/test_ConvUnify.cpp index 02763406a3..2ab18f8868 100644 --- a/paddle/gserver/tests/test_ConvUnify.cpp +++ b/paddle/gserver/tests/test_ConvUnify.cpp @@ -28,11 +28,11 @@ limitations under the License. 
*/ using namespace paddle; // NOLINT using namespace std; // NOLINT -P_DECLARE_bool(use_gpu); -P_DECLARE_int32(gpu_id); -P_DECLARE_double(checkgrad_eps); -P_DECLARE_bool(thread_local_rand_use_global_seed); -P_DECLARE_bool(prev_batch_state); +DECLARE_bool(use_gpu); +DECLARE_int32(gpu_id); +DECLARE_double(checkgrad_eps); +DECLARE_bool(thread_local_rand_use_global_seed); +DECLARE_bool(prev_batch_state); // Do one forward pass of convTrans layer and check to see if its output // matches the given result diff --git a/paddle/gserver/tests/test_Evaluator.cpp b/paddle/gserver/tests/test_Evaluator.cpp index 7a930aebcf..e07066dad8 100644 --- a/paddle/gserver/tests/test_Evaluator.cpp +++ b/paddle/gserver/tests/test_Evaluator.cpp @@ -21,9 +21,9 @@ limitations under the License. */ using namespace paddle; // NOLINT using namespace std; // NOLINT -P_DECLARE_bool(use_gpu); -P_DECLARE_int32(gpu_id); -P_DECLARE_bool(thread_local_rand_use_global_seed); +DECLARE_bool(use_gpu); +DECLARE_int32(gpu_id); +DECLARE_bool(thread_local_rand_use_global_seed); enum InputType { INPUT_DATA, // dense vector diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp index 9f8b197df5..8a8d094ed3 100644 --- a/paddle/gserver/tests/test_LayerGrad.cpp +++ b/paddle/gserver/tests/test_LayerGrad.cpp @@ -26,11 +26,11 @@ limitations under the License. */ using namespace paddle; // NOLINT using namespace std; // NOLINT -P_DECLARE_bool(use_gpu); -P_DECLARE_int32(gpu_id); -P_DECLARE_double(checkgrad_eps); -P_DECLARE_bool(thread_local_rand_use_global_seed); -P_DECLARE_bool(prev_batch_state); +DECLARE_bool(use_gpu); +DECLARE_int32(gpu_id); +DECLARE_double(checkgrad_eps); +DECLARE_bool(thread_local_rand_use_global_seed); +DECLARE_bool(prev_batch_state); TEST(Operator, dot_mul) { TestConfig config; diff --git a/paddle/gserver/tests/test_NetworkCompare.cpp b/paddle/gserver/tests/test_NetworkCompare.cpp index baa55aa025..fc60228f81 100644 --- a/paddle/gserver/tests/test_NetworkCompare.cpp +++ b/paddle/gserver/tests/test_NetworkCompare.cpp @@ -25,10 +25,10 @@ limitations under the License. */ using namespace paddle; // NOLINT using namespace std; // NOLINT -P_DECLARE_int32(gpu_id); -P_DECLARE_double(checkgrad_eps); -P_DEFINE_bool(use_label, true, "input label or sequence label"); -P_DEFINE_bool(static_para, false, "static parameter"); +DECLARE_int32(gpu_id); +DECLARE_double(checkgrad_eps); +DEFINE_bool(use_label, true, "input label or sequence label"); +DEFINE_bool(static_para, false, "static parameter"); struct DataIn { std::vector inArgs; @@ -267,8 +267,8 @@ TEST(Compare, img_conv2) { } #endif -P_DEFINE_string(config_file_a, "", "config of one network to compare"); -P_DEFINE_string(config_file_b, "", "config of another network to compare"); +DEFINE_string(config_file_a, "", "config of one network to compare"); +DEFINE_string(config_file_b, "", "config of another network to compare"); TEST(Compare, network) { if (FLAGS_config_file_a != "" && FLAGS_config_file_b != "") { compareNetwork(FLAGS_config_file_a, FLAGS_config_file_b); diff --git a/paddle/gserver/tests/test_PyDataProvider2.cpp b/paddle/gserver/tests/test_PyDataProvider2.cpp index 436318d356..5f8bc5ecd0 100644 --- a/paddle/gserver/tests/test_PyDataProvider2.cpp +++ b/paddle/gserver/tests/test_PyDataProvider2.cpp @@ -19,7 +19,7 @@ limitations under the License. 
*/ #include "paddle/utils/PythonUtil.h" #include "paddle/utils/Util.h" -P_DEFINE_string(train_list, "unittest.list", "file list for unittest"); +DEFINE_string(train_list, "unittest.list", "file list for unittest"); namespace paddle { namespace unittest { diff --git a/paddle/gserver/tests/test_RecurrentGradientMachine.cpp b/paddle/gserver/tests/test_RecurrentGradientMachine.cpp index a351667d8b..874aabf37c 100644 --- a/paddle/gserver/tests/test_RecurrentGradientMachine.cpp +++ b/paddle/gserver/tests/test_RecurrentGradientMachine.cpp @@ -20,7 +20,7 @@ limitations under the License. */ #include #include -P_DECLARE_int32(seed); +DECLARE_int32(seed); using namespace paddle; // NOLINT using namespace std; // NOLINT diff --git a/paddle/gserver/tests/test_RecurrentLayer.cpp b/paddle/gserver/tests/test_RecurrentLayer.cpp index cd96ca7c84..f91c788863 100644 --- a/paddle/gserver/tests/test_RecurrentLayer.cpp +++ b/paddle/gserver/tests/test_RecurrentLayer.cpp @@ -23,9 +23,9 @@ limitations under the License. */ using namespace paddle; // NOLINT using namespace std; // NOLINT -P_DECLARE_bool(use_gpu); -P_DECLARE_bool(rnn_use_batch); -P_DECLARE_int32(fixed_seq_length); +DECLARE_bool(use_gpu); +DECLARE_bool(rnn_use_batch); +DECLARE_int32(fixed_seq_length); void checkError(const Matrix& matrix1, const Matrix& matrix2) { CHECK(matrix1.getHeight() == matrix2.getHeight()); diff --git a/paddle/gserver/tests/test_SelectiveFCLayer.cpp b/paddle/gserver/tests/test_SelectiveFCLayer.cpp index 4f3a95a535..ab23d00a2c 100644 --- a/paddle/gserver/tests/test_SelectiveFCLayer.cpp +++ b/paddle/gserver/tests/test_SelectiveFCLayer.cpp @@ -29,11 +29,11 @@ limitations under the License. */ using namespace paddle; // NOLINT using namespace std; // NOLINT -P_DECLARE_bool(use_gpu); -P_DECLARE_int32(num_passes); -P_DECLARE_string(config); -P_DECLARE_string(init_model_path); -P_DECLARE_string(config_args); +DECLARE_bool(use_gpu); +DECLARE_int32(num_passes); +DECLARE_string(config); +DECLARE_string(init_model_path); +DECLARE_string(config_args); size_t fcLayerWidth = 1024; diff --git a/paddle/gserver/tests/test_WarpCTCLayer.cpp b/paddle/gserver/tests/test_WarpCTCLayer.cpp index 700425412c..0a4a814d52 100644 --- a/paddle/gserver/tests/test_WarpCTCLayer.cpp +++ b/paddle/gserver/tests/test_WarpCTCLayer.cpp @@ -25,7 +25,7 @@ limitations under the License. */ using namespace paddle; // NOLINT using namespace std; // NOLINT -P_DECLARE_bool(use_gpu); +DECLARE_bool(use_gpu); const real* getData(const Matrix& matrix) { if (matrix.useGpu()) { diff --git a/paddle/math/SparseRowMatrix.cpp b/paddle/math/SparseRowMatrix.cpp index 3091743123..b61c6b2d49 100644 --- a/paddle/math/SparseRowMatrix.cpp +++ b/paddle/math/SparseRowMatrix.cpp @@ -24,9 +24,9 @@ limitations under the License. */ #include "paddle/utils/Thread.h" #include "paddle/utils/Util.h" -P_DEFINE_bool(allow_inefficient_sparse_update, - false, - "Whether to allow inefficient sparse update"); +DEFINE_bool(allow_inefficient_sparse_update, + false, + "Whether to allow inefficient sparse update"); namespace paddle { diff --git a/paddle/math/SparseRowMatrix.h b/paddle/math/SparseRowMatrix.h index badb4b9c1c..9364feb4a1 100644 --- a/paddle/math/SparseRowMatrix.h +++ b/paddle/math/SparseRowMatrix.h @@ -20,7 +20,7 @@ limitations under the License. 
*/ #include "paddle/utils/CommandLineParser.h" #include "paddle/utils/Util.h" -P_DECLARE_bool(allow_inefficient_sparse_update); +DECLARE_bool(allow_inefficient_sparse_update); namespace paddle { diff --git a/paddle/math/Storage.cpp b/paddle/math/Storage.cpp index f9a2c12cd5..56e5442394 100644 --- a/paddle/math/Storage.cpp +++ b/paddle/math/Storage.cpp @@ -16,9 +16,9 @@ limitations under the License. */ #include "Allocator.h" #include "paddle/utils/Util.h" -P_DEFINE_int32(pool_limit_size, - 536870912, - "maximum memory size managed by a memory pool, default is 512M"); +DEFINE_int32(pool_limit_size, + 536870912, + "maximum memory size managed by a memory pool, default is 512M"); namespace paddle { diff --git a/paddle/math/tests/test_TrainingAlgorithm.cpp b/paddle/math/tests/test_TrainingAlgorithm.cpp index 1bf6a0cc43..2c458cba9c 100644 --- a/paddle/math/tests/test_TrainingAlgorithm.cpp +++ b/paddle/math/tests/test_TrainingAlgorithm.cpp @@ -22,9 +22,9 @@ limitations under the License. */ using namespace paddle; // NOLINT #ifndef PADDLE_TYPE_DOUBLE -P_DEFINE_double(max_diff, 1e-5, "max diff allowed"); +DEFINE_double(max_diff, 1e-5, "max diff allowed"); #else -P_DEFINE_double(max_diff, 1e-13, "max diff allowed"); +DEFINE_double(max_diff, 1e-13, "max diff allowed"); #endif class SetMaxDiff { diff --git a/paddle/parameter/Argument.cpp b/paddle/parameter/Argument.cpp index e91daa3717..65d01a1571 100644 --- a/paddle/parameter/Argument.cpp +++ b/paddle/parameter/Argument.cpp @@ -245,6 +245,8 @@ int32_t Argument::resizeAndCopyFrom(const Argument& src, bool useGpu, hl_stream_t stream) { dataId = src.dataId; + frameWidth = src.frameWidth; + frameHeight = src.frameHeight; if (!src.sequenceStartPositions) { // non-sequence input, copy samples directly diff --git a/paddle/parameter/FirstOrderOptimizer.cpp b/paddle/parameter/FirstOrderOptimizer.cpp index 630f15c8cf..dbb738e98b 100644 --- a/paddle/parameter/FirstOrderOptimizer.cpp +++ b/paddle/parameter/FirstOrderOptimizer.cpp @@ -19,7 +19,7 @@ limitations under the License. */ #include -P_DEFINE_bool(log_clipping, false, "enable log clipping or not"); +DEFINE_bool(log_clipping, false, "enable log clipping or not"); namespace paddle { diff --git a/paddle/parameter/Parameter.cpp b/paddle/parameter/Parameter.cpp index 986ae1539b..1673fc6e53 100644 --- a/paddle/parameter/Parameter.cpp +++ b/paddle/parameter/Parameter.cpp @@ -26,11 +26,11 @@ limitations under the License. */ #include "paddle/utils/CommandLineParser.h" #include "paddle/utils/Logging.h" -P_DEFINE_int32(enable_grad_share, - (100 * 1024 * 1024), - "threshold for enable gradient parameter share for batch " - "multi-cpu training"); -P_DEFINE_int32( +DEFINE_int32(enable_grad_share, + (100 * 1024 * 1024), + "threshold for enable gradient parameter share for batch " + "multi-cpu training"); +DEFINE_int32( grad_share_block_num, 64, "block number of gradient parameter share for batch multi-cpu training"); diff --git a/paddle/pserver/BaseClient.cpp b/paddle/pserver/BaseClient.cpp index a43def98c5..b4ac7a2506 100644 --- a/paddle/pserver/BaseClient.cpp +++ b/paddle/pserver/BaseClient.cpp @@ -18,7 +18,7 @@ limitations under the License. 
*/ #include "paddle/utils/CommandLineParser.h" #include "paddle/utils/Stat.h" -P_DECLARE_string(pservers); +DECLARE_string(pservers); namespace paddle { diff --git a/paddle/pserver/LightNetwork.cpp b/paddle/pserver/LightNetwork.cpp index 329dfb0fb3..cbc105e651 100644 --- a/paddle/pserver/LightNetwork.cpp +++ b/paddle/pserver/LightNetwork.cpp @@ -31,23 +31,23 @@ limitations under the License. */ #include "paddle/utils/Util.h" /// quick ack can reduce the latency of small message -P_DEFINE_bool(small_messages, - false, - "if message size is small, recommend set it True to enable quick " - "ack and no delay"); +DEFINE_bool(small_messages, + false, + "if message size is small, recommend set it True to enable quick " + "ack and no delay"); /// reasonable sock_send_buf_size can control the traffic injected into switch /// network. Injecting too many data into traffic could cause packets loss which /// cause long latency and degrade the efficiency of communication. -P_DEFINE_int32(sock_send_buf_size, - 1024 * 1024 * 40, - "restrict sock send buff size, can reduce network congestion if " - "set carefully"); +DEFINE_int32(sock_send_buf_size, + 1024 * 1024 * 40, + "restrict sock send buff size, can reduce network congestion if " + "set carefully"); /// reasonable size can hold bursted packets and reduce packets loss -P_DEFINE_int32(sock_recv_buf_size, - 1024 * 1024 * 40, - "restrict sock recv buff size"); +DEFINE_int32(sock_recv_buf_size, + 1024 * 1024 * 40, + "restrict sock recv buff size"); namespace paddle { diff --git a/paddle/pserver/ParameterClient2.cpp b/paddle/pserver/ParameterClient2.cpp index 86fd1c5276..a97859f83f 100644 --- a/paddle/pserver/ParameterClient2.cpp +++ b/paddle/pserver/ParameterClient2.cpp @@ -20,8 +20,8 @@ limitations under the License. */ #include "paddle/utils/Stat.h" #include "paddle/utils/StringUtil.h" -P_DEFINE_string(pservers, "127.0.0.1", "Comma separated addresses of pservers"); -P_DEFINE_int32(parallel_thread_num, 1, "Thread number for parameter send"); +DEFINE_string(pservers, "127.0.0.1", "Comma separated addresses of pservers"); +DEFINE_int32(parallel_thread_num, 1, "Thread number for parameter send"); namespace paddle { diff --git a/paddle/pserver/ParameterClient2.h b/paddle/pserver/ParameterClient2.h index 5255394949..eed71ccb43 100644 --- a/paddle/pserver/ParameterClient2.h +++ b/paddle/pserver/ParameterClient2.h @@ -34,7 +34,7 @@ limitations under the License. */ #include "ProtoServer.h" #include "SparseParameterDistribution.h" -P_DECLARE_int32(parallel_thread_num); +DECLARE_int32(parallel_thread_num); namespace paddle { diff --git a/paddle/pserver/ParameterServer2.cpp b/paddle/pserver/ParameterServer2.cpp index 2cb4c93535..856fa0ad1a 100644 --- a/paddle/pserver/ParameterServer2.cpp +++ b/paddle/pserver/ParameterServer2.cpp @@ -30,11 +30,11 @@ limitations under the License. 
*/ #include "paddle/utils/GlobalConstants.h" #include "paddle/utils/Stat.h" -P_DEFINE_int32(pserver_num_threads, 1, "number of threads for sync op exec"); -P_DEFINE_double(async_lagged_ratio_min, - 1.0, - "control config_.async_lagged_grad_discard_ratio() min value"); -P_DEFINE_double( +DEFINE_int32(pserver_num_threads, 1, "number of threads for sync op exec"); +DEFINE_double(async_lagged_ratio_min, + 1.0, + "control config_.async_lagged_grad_discard_ratio() min value"); +DEFINE_double( async_lagged_ratio_default, 1.5, "if async_lagged_grad_discard_ratio is not set in trainer_config.conf" diff --git a/paddle/pserver/ParameterServer2.h b/paddle/pserver/ParameterServer2.h index 61c139981e..b0cf22e1fb 100644 --- a/paddle/pserver/ParameterServer2.h +++ b/paddle/pserver/ParameterServer2.h @@ -38,7 +38,7 @@ limitations under the License. */ #include "ProtoServer.h" -P_DECLARE_int32(port); +DECLARE_int32(port); namespace paddle { diff --git a/paddle/pserver/SparseParameterDistribution.cpp b/paddle/pserver/SparseParameterDistribution.cpp index 0068f85b52..6dd725db30 100644 --- a/paddle/pserver/SparseParameterDistribution.cpp +++ b/paddle/pserver/SparseParameterDistribution.cpp @@ -20,26 +20,26 @@ limitations under the License. */ #include "SparseParameterDistribution.h" -P_DEFINE_bool(check_sparse_distribution_in_pserver, - false, - "check whether sparse parameter exhibts balanced distribution at " - "all pservers"); -P_DEFINE_bool(show_check_sparse_distribution_log, - false, - "show logs details for sparse parameter distribution in pserver"); -P_DEFINE_int32(check_sparse_distribution_batches, - 100, - "run sparse parameter distribution check for N batches"); -P_DEFINE_double( +DEFINE_bool(check_sparse_distribution_in_pserver, + false, + "check whether sparse parameter exhibts balanced distribution at " + "all pservers"); +DEFINE_bool(show_check_sparse_distribution_log, + false, + "show logs details for sparse parameter distribution in pserver"); +DEFINE_int32(check_sparse_distribution_batches, + 100, + "run sparse parameter distribution check for N batches"); +DEFINE_double( check_sparse_distribution_ratio, 0.6, "if parameters dispatched to different pservers exhibit unbalanced " " distribution for check_sparse_distribution_ratio * " " check_sparse_distribution_batches times, crash program"); -P_DEFINE_double(check_sparse_distribution_unbalance_degree, - 2.0, - "the ratio of maximum data size and minimun data size for " - "different pserver"); +DEFINE_double(check_sparse_distribution_unbalance_degree, + 2.0, + "the ratio of maximum data size and minimun data size for " + "different pserver"); namespace paddle { diff --git a/paddle/pserver/test/SocketTest.cpp b/paddle/pserver/test/SocketTest.cpp index 6e63c4f678..066a6c0293 100644 --- a/paddle/pserver/test/SocketTest.cpp +++ b/paddle/pserver/test/SocketTest.cpp @@ -195,9 +195,9 @@ SocketClient::SocketClient(const std::string& serverAddr, int serverPort) { channel_.reset(new SocketChannel(sockfd)); } -P_DEFINE_string(server_addr, "127.0.0.1", "Server address"); -P_DEFINE_int64(dim, 10000000, "Data size"); -P_DEFINE_int32(loop_time, 100000, "test loop time"); +DEFINE_string(server_addr, "127.0.0.1", "Server address"); +DEFINE_int64(dim, 10000000, "Data size"); +DEFINE_int32(loop_time, 100000, "test loop time"); using namespace paddle; // NOLINT diff --git a/paddle/pserver/test/test_ParameterServer2.cpp b/paddle/pserver/test/test_ParameterServer2.cpp index 4257a2308d..8e7231a9e1 100644 --- a/paddle/pserver/test/test_ParameterServer2.cpp +++ 
b/paddle/pserver/test/test_ParameterServer2.cpp @@ -21,9 +21,9 @@ limitations under the License. */ using namespace paddle; // NOLINT using namespace std; // NOLINT -P_DECLARE_int32(num_gradient_servers); -P_DEFINE_string(server_addr, "127.0.0.1", "assign server address"); -P_DEFINE_int32(server_cpu, 0, "assign server cpu"); +DECLARE_int32(num_gradient_servers); +DEFINE_string(server_addr, "127.0.0.1", "assign server address"); +DEFINE_int32(server_cpu, 0, "assign server cpu"); class ParameterServer2Tester : public ParameterServer2 { public: diff --git a/paddle/pserver/test/test_ProtoServer.cpp b/paddle/pserver/test/test_ProtoServer.cpp index 3880dde5e3..9f86ee80f4 100644 --- a/paddle/pserver/test/test_ProtoServer.cpp +++ b/paddle/pserver/test/test_ProtoServer.cpp @@ -21,10 +21,10 @@ limitations under the License. */ #include "paddle/pserver/ProtoServer.h" #include "paddle/utils/Stat.h" -P_DEFINE_string(server_addr, "127.0.0.1", "Server address"); -P_DEFINE_int64(dim, 50000000, "Data size"); -P_DEFINE_bool(test_proto_server, true, "whether to test ProtoServer"); -P_DEFINE_bool(benchmark, false, "Do benchmark. Skip some tests"); +DEFINE_string(server_addr, "127.0.0.1", "Server address"); +DEFINE_int64(dim, 50000000, "Data size"); +DEFINE_bool(test_proto_server, true, "whether to test ProtoServer"); +DEFINE_bool(benchmark, false, "Do benchmark. Skip some tests"); using namespace paddle; // NOLINT diff --git a/paddle/scripts/submit_local.sh.in b/paddle/scripts/submit_local.sh.in index ace2c0dee9..283fd34a6d 100644 --- a/paddle/scripts/submit_local.sh.in +++ b/paddle/scripts/submit_local.sh.in @@ -21,8 +21,6 @@ function version(){ echo " with_double: @WITH_DOUBLE@" echo " with_python: @WITH_PYTHON@" echo " with_rdma: @WITH_RDMA@" - echo " with_glog: @WITH_GLOG@" - echo " with_gflags: @WITH_GFLAGS@" echo " with_metric_learning: @WITH_METRIC@" echo " with_timer: @WITH_TIMER@" echo " with_predict_sdk: @WITH_PREDICT_SDK@" diff --git a/paddle/scripts/travis/precommit.sh b/paddle/scripts/travis/precommit.sh index 5ad84f1821..7a59b1131d 100755 --- a/paddle/scripts/travis/precommit.sh +++ b/paddle/scripts/travis/precommit.sh @@ -12,6 +12,9 @@ cd .. export PATH=/usr/bin:$PATH pre-commit install clang-format --version -pre-commit run -a + +if ! pre-commit run -a ; then + git diff --exit-code +fi trap : 0 diff --git a/paddle/trainer/MergeModel.cpp b/paddle/trainer/MergeModel.cpp index 1cf29a39b9..91d89b61a3 100644 --- a/paddle/trainer/MergeModel.cpp +++ b/paddle/trainer/MergeModel.cpp @@ -19,8 +19,8 @@ limitations under the License. */ #include "paddle/pserver/ParameterServer2.h" #include "paddle/utils/PythonUtil.h" -P_DEFINE_string(model_dir, "", "Directory for separated model files"); -P_DEFINE_string(model_file, "", "File for merged model file"); +DEFINE_string(model_dir, "", "Directory for separated model files"); +DEFINE_string(model_file, "", "File for merged model file"); using namespace paddle; // NOLINT using namespace std; // NOLINT diff --git a/paddle/trainer/RemoteParameterUpdater.cpp b/paddle/trainer/RemoteParameterUpdater.cpp index b7f7b93b8d..974e78fa17 100644 --- a/paddle/trainer/RemoteParameterUpdater.cpp +++ b/paddle/trainer/RemoteParameterUpdater.cpp @@ -17,8 +17,8 @@ limitations under the License. 
*/ #include "paddle/utils/GlobalConstants.h" #include "paddle/utils/Stat.h" -P_DECLARE_int32(trainer_id); -P_DECLARE_string(save_dir); +DECLARE_int32(trainer_id); +DECLARE_string(save_dir); namespace paddle { diff --git a/paddle/trainer/ThreadParameterUpdater.cpp b/paddle/trainer/ThreadParameterUpdater.cpp index bee7f061fe..9caa92a4d7 100644 --- a/paddle/trainer/ThreadParameterUpdater.cpp +++ b/paddle/trainer/ThreadParameterUpdater.cpp @@ -19,7 +19,7 @@ limitations under the License. */ #include "paddle/math/SparseRowMatrix.h" #include "paddle/utils/Thread.h" -P_DECLARE_int32(trainer_count); +DECLARE_int32(trainer_count); namespace paddle { diff --git a/paddle/trainer/Trainer.cpp b/paddle/trainer/Trainer.cpp index 85610ec04e..1eec2c432d 100644 --- a/paddle/trainer/Trainer.cpp +++ b/paddle/trainer/Trainer.cpp @@ -38,60 +38,56 @@ limitations under the License. */ #include "paddle/gserver/gradientmachines/NeuralNetwork.h" #include "paddle/gserver/layers/ValidationLayer.h" -P_DEFINE_string(config, "", "Trainer config file"); - -P_DEFINE_int32(test_period, - 0, - "if equal 0, do test on all test data at the end of " - "each pass. While if equal non-zero, do test on all test " - "data every test_period batches"); -P_DEFINE_bool(test_all_data_in_one_period, - false, - "This option was deprecated, since we will always do " - "test on all test set "); - -P_DEFINE_bool(local, true, "Train in local mode or not"); - -P_DEFINE_int32(average_test_period, - 0, - "Do test on average parameter every so" - " many batches. MUST be devided by FLAGS_log_period." - " Default 0 means do not test average parameter"); - -P_DEFINE_int32(saving_period, 1, "Save parameteres every so many passes"); -P_DEFINE_int64(saving_period_by_batches, - 0, - "Save parameters every so many batches in one pass"); -P_DEFINE_string(save_dir, "", "Directory for saving model parameter"); -P_DEFINE_int32(start_pass, - 0, - "Start training from this pass. " - "Will load parameter from the previous pass"); -P_DEFINE_int32(test_pass, - -1, - "Will load parameter start from this pass to test"); -P_DEFINE_int32(test_wait, 0, "Waiting for pass parameter if not exist"); -P_DEFINE_bool(with_cost, true, "enable cost layer or not"); -P_DEFINE_bool(distribute_test, false, "test in distribute mode"); - -P_DEFINE_int32(num_passes, 100, "train for so many passes"); - -P_DEFINE_string(config_args, - "", - "arguments passed to config file." - "Format: key1=value1,key2=value2"); - -P_DEFINE_bool(save_only_one, - false, - "Save only parameters in last pass, remove previous."); - -P_DEFINE_string(feat_file, "", "File name of extracted feature."); -P_DEFINE_string(predict_output_dir, - "", - "Directory that saves the predicted results of output layers"); -P_DEFINE_string(model_list, - "", - "File that saves the model list when evaluation"); +DEFINE_string(config, "", "Trainer config file"); + +DEFINE_int32(test_period, + 0, + "if equal 0, do test on all test data at the end of " + "each pass. While if equal non-zero, do test on all test " + "data every test_period batches"); +DEFINE_bool(test_all_data_in_one_period, + false, + "This option was deprecated, since we will always do " + "test on all test set "); + +DEFINE_bool(local, true, "Train in local mode or not"); + +DEFINE_int32(average_test_period, + 0, + "Do test on average parameter every so" + " many batches. MUST be devided by FLAGS_log_period." 
+ " Default 0 means do not test average parameter"); + +DEFINE_int32(saving_period, 1, "Save parameteres every so many passes"); +DEFINE_int64(saving_period_by_batches, + 0, + "Save parameters every so many batches in one pass"); +DEFINE_string(save_dir, "", "Directory for saving model parameter"); +DEFINE_int32(start_pass, + 0, + "Start training from this pass. " + "Will load parameter from the previous pass"); +DEFINE_int32(test_pass, -1, "Will load parameter start from this pass to test"); +DEFINE_int32(test_wait, 0, "Waiting for pass parameter if not exist"); +DEFINE_bool(with_cost, true, "enable cost layer or not"); +DEFINE_bool(distribute_test, false, "test in distribute mode"); + +DEFINE_int32(num_passes, 100, "train for so many passes"); + +DEFINE_string(config_args, + "", + "arguments passed to config file." + "Format: key1=value1,key2=value2"); + +DEFINE_bool(save_only_one, + false, + "Save only parameters in last pass, remove previous."); + +DEFINE_string(feat_file, "", "File name of extracted feature."); +DEFINE_string(predict_output_dir, + "", + "Directory that saves the predicted results of output layers"); +DEFINE_string(model_list, "", "File that saves the model list when evaluation"); namespace paddle { diff --git a/paddle/trainer/Trainer.h b/paddle/trainer/Trainer.h index cabbb4acd1..7cbf18ace7 100644 --- a/paddle/trainer/Trainer.h +++ b/paddle/trainer/Trainer.h @@ -34,7 +34,7 @@ limitations under the License. */ #include "paddle/internals/metric_learning/MetricTrainer.h" #endif -P_DECLARE_int32(num_passes); +DECLARE_int32(num_passes); namespace paddle { diff --git a/paddle/trainer/TrainerBenchmark.cpp b/paddle/trainer/TrainerBenchmark.cpp index 5c3177c808..173653c816 100644 --- a/paddle/trainer/TrainerBenchmark.cpp +++ b/paddle/trainer/TrainerBenchmark.cpp @@ -18,9 +18,9 @@ limitations under the License. */ #include "paddle/utils/Stat.h" #include "paddle/utils/Util.h" -P_DECLARE_int32(test_period); +DECLARE_int32(test_period); -P_DEFINE_bool(feed_data, false, "Wether to read data from DataProvider."); +DEFINE_bool(feed_data, false, "Wether to read data from DataProvider."); namespace paddle { diff --git a/paddle/trainer/TrainerConfigHelper.cpp b/paddle/trainer/TrainerConfigHelper.cpp index 2017a08d20..60ac8459a1 100644 --- a/paddle/trainer/TrainerConfigHelper.cpp +++ b/paddle/trainer/TrainerConfigHelper.cpp @@ -18,16 +18,16 @@ limitations under the License. */ #include "paddle/utils/Flags.h" #include "paddle/utils/PythonUtil.h" -P_DECLARE_string(config); -P_DECLARE_string(init_model_path); -P_DECLARE_int32(start_pass); -P_DECLARE_string(save_dir); -P_DECLARE_int32(trainer_id); -P_DECLARE_bool(local); -P_DECLARE_bool(with_cost); -P_DECLARE_bool(with_gpu); -P_DECLARE_bool(parallel_nn); -P_DECLARE_string(config_args); +DECLARE_string(config); +DECLARE_string(init_model_path); +DECLARE_int32(start_pass); +DECLARE_string(save_dir); +DECLARE_int32(trainer_id); +DECLARE_bool(local); +DECLARE_bool(with_cost); +DECLARE_bool(with_gpu); +DECLARE_bool(parallel_nn); +DECLARE_string(config_args); const char *kConfigParserModuleName = "paddle.trainer.config_parser"; const char *kConfigParserFuncName = "parse_config_and_serialize"; diff --git a/paddle/trainer/TrainerInternalConfig.cpp b/paddle/trainer/TrainerInternalConfig.cpp index a017cdec9d..039fcdb524 100644 --- a/paddle/trainer/TrainerInternalConfig.cpp +++ b/paddle/trainer/TrainerInternalConfig.cpp @@ -14,17 +14,17 @@ limitations under the License. 
*/ #include "TrainerInternalConfig.h" -P_DEFINE_int32(show_parameter_stats_period, - 0, - "Whether to show parameter stats during training"); +DEFINE_int32(show_parameter_stats_period, + 0, + "Whether to show parameter stats during training"); -P_DEFINE_int32(dot_period, 1, "Print '.' every so many batches"); +DEFINE_int32(dot_period, 1, "Print '.' every so many batches"); -P_DEFINE_bool(use_old_updater, false, "Use the old RemoteParameterUpdater"); +DEFINE_bool(use_old_updater, false, "Use the old RemoteParameterUpdater"); -P_DECLARE_int32(num_passes); +DECLARE_int32(num_passes); -P_DECLARE_bool(local); +DECLARE_bool(local); namespace paddle { diff --git a/paddle/trainer/TrainerMain.cpp b/paddle/trainer/TrainerMain.cpp index 0a4d56b892..947f9cadcc 100644 --- a/paddle/trainer/TrainerMain.cpp +++ b/paddle/trainer/TrainerMain.cpp @@ -22,21 +22,20 @@ limitations under the License. */ #include "Trainer.h" #include "paddle/pserver/RDMANetwork.h" -P_DEFINE_bool(start_pserver, false, "Whether to start pserver"); -P_DECLARE_int32(gpu_id); -P_DEFINE_string(job, "train", "one of (train, test, checkgrad)"); -P_DECLARE_int32(start_pass); -P_DECLARE_string(config); -P_DECLARE_string(init_model_path); -P_DECLARE_string(rdma_tcp); +DEFINE_bool(start_pserver, false, "Whether to start pserver"); +DECLARE_int32(gpu_id); +DEFINE_string(job, "train", "one of (train, test, checkgrad)"); +DECLARE_int32(start_pass); +DECLARE_string(config); +DECLARE_string(init_model_path); +DECLARE_string(rdma_tcp); using namespace paddle; // NOLINT int main(int argc, char** argv) { -// write logs instantly (never buffer log messages) -#ifdef PADDLE_USE_GLOG + // write logs instantly (never buffer log messages) FLAGS_logbuflevel = -1; -#endif + initMain(argc, argv); initPython(argc, argv); diff --git a/paddle/trainer/tests/test_Compare.cpp b/paddle/trainer/tests/test_Compare.cpp index 63fa48540c..72fc76bea3 100644 --- a/paddle/trainer/tests/test_Compare.cpp +++ b/paddle/trainer/tests/test_Compare.cpp @@ -24,10 +24,10 @@ using namespace std; // NOLINT static const string& configFile = "trainer/tests/sample_trainer_config.conf"; -P_DECLARE_int32(gpu_id); -P_DECLARE_bool(use_gpu); -P_DECLARE_string(config); -P_DECLARE_string(config_args); +DECLARE_int32(gpu_id); +DECLARE_bool(use_gpu); +DECLARE_string(config); +DECLARE_string(config_args); struct comData { vector outArgs; diff --git a/paddle/trainer/tests/test_CompareSparse.cpp b/paddle/trainer/tests/test_CompareSparse.cpp index 3fea3a3c24..a7000eb77e 100644 --- a/paddle/trainer/tests/test_CompareSparse.cpp +++ b/paddle/trainer/tests/test_CompareSparse.cpp @@ -25,22 +25,22 @@ using namespace std; // NOLINT static const string& configFile1 = "trainer/tests/sample_trainer_config_qb_rnn.conf"; -P_DECLARE_bool(use_gpu); -P_DECLARE_string(config); -P_DECLARE_int32(gpu_id); -P_DECLARE_int32(seed); -P_DECLARE_int32(num_passes); -P_DECLARE_int32(saving_period); - -P_DECLARE_int32(num_gradient_servers); -P_DECLARE_int32(port); -P_DECLARE_bool(local); -P_DECLARE_bool(use_old_updater); -P_DECLARE_bool(parallel_nn); -P_DECLARE_string(config_args); -P_DEFINE_double(max_diff_ratio, - 0.0f, - "max diff ratio allowed for parameters value"); +DECLARE_bool(use_gpu); +DECLARE_string(config); +DECLARE_int32(gpu_id); +DECLARE_int32(seed); +DECLARE_int32(num_passes); +DECLARE_int32(saving_period); + +DECLARE_int32(num_gradient_servers); +DECLARE_int32(port); +DECLARE_bool(local); +DECLARE_bool(use_old_updater); +DECLARE_bool(parallel_nn); +DECLARE_string(config_args); +DEFINE_double(max_diff_ratio, + 
0.0f, + "max diff ratio allowed for parameters value"); int gNumDevices = 0; diff --git a/paddle/trainer/tests/test_CompareTwoNets.cpp b/paddle/trainer/tests/test_CompareTwoNets.cpp index 8a4556721d..80c61e259e 100644 --- a/paddle/trainer/tests/test_CompareTwoNets.cpp +++ b/paddle/trainer/tests/test_CompareTwoNets.cpp @@ -22,25 +22,25 @@ limitations under the License. */ using namespace paddle; // NOLINT using namespace std; // NOLINT -P_DECLARE_int32(gpu_id); +DECLARE_int32(gpu_id); -P_DECLARE_bool(local); -P_DECLARE_bool(use_gpu); +DECLARE_bool(local); +DECLARE_bool(use_gpu); -P_DECLARE_string(config); -P_DECLARE_string(nics); +DECLARE_string(config); +DECLARE_string(nics); -P_DEFINE_string(config_file_a, "", "config of one network to compare"); -P_DEFINE_string(config_file_b, "", "config of another network to compare"); -P_DEFINE_bool(need_high_accuracy, - false, - "whether need to run in double accuracy"); -P_DEFINE_double( +DEFINE_string(config_file_a, "", "config of one network to compare"); +DEFINE_string(config_file_b, "", "config of another network to compare"); +DEFINE_bool(need_high_accuracy, + false, + "whether need to run in double accuracy"); +DEFINE_double( max_diff_ratio, 0.0f, "max diff ratio allowed for outputs and parameters (value/gradient)"); -P_DECLARE_bool(thread_local_rand_use_global_seed); -P_DECLARE_int32(seed); +DECLARE_bool(thread_local_rand_use_global_seed); +DECLARE_int32(seed); struct ComData { vector outArgs; diff --git a/paddle/trainer/tests/test_CompareTwoOpts.cpp b/paddle/trainer/tests/test_CompareTwoOpts.cpp index 673ef289d8..383505f813 100644 --- a/paddle/trainer/tests/test_CompareTwoOpts.cpp +++ b/paddle/trainer/tests/test_CompareTwoOpts.cpp @@ -22,20 +22,20 @@ limitations under the License. */ using namespace paddle; // NOLINT using namespace std; // NOLINT -P_DECLARE_int32(gpu_id); +DECLARE_int32(gpu_id); -P_DECLARE_bool(local); -P_DECLARE_bool(use_gpu); +DECLARE_bool(local); +DECLARE_bool(use_gpu); -P_DECLARE_string(config); -P_DECLARE_string(nics); +DECLARE_string(config); +DECLARE_string(nics); -P_DEFINE_string(config_file_a, "", "config of one network to compare"); -P_DEFINE_string(config_file_b, "", "config of another network to compare"); -P_DEFINE_bool(need_high_accuracy, - true, - "whether need to run in double accuracy (recommended)"); -P_DEFINE_double( +DEFINE_string(config_file_a, "", "config of one network to compare"); +DEFINE_string(config_file_b, "", "config of another network to compare"); +DEFINE_bool(need_high_accuracy, + true, + "whether need to run in double accuracy (recommended)"); +DEFINE_double( max_diff_ratio, 0.0f, "max diff ratio allowed for outputs and parameters (value/gradient)"); diff --git a/paddle/trainer/tests/test_Prediction.cpp b/paddle/trainer/tests/test_Prediction.cpp index 322121a579..0c79404eee 100644 --- a/paddle/trainer/tests/test_Prediction.cpp +++ b/paddle/trainer/tests/test_Prediction.cpp @@ -18,11 +18,11 @@ limitations under the License. 
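Since glog is now always linked, the #ifdef PADDLE_USE_GLOG guard around the logbuflevel assignment in TrainerMain.cpp (above) could be dropped. FLAGS_logbuflevel is a standard glog flag: messages at or below the given severity are buffered, and -1 disables buffering entirely. A minimal sketch of that behavior, using stock glog calls rather than anything from this patch:

#include <glog/logging.h>

int main(int argc, char** argv) {
  // Flush every message immediately instead of buffering INFO logs;
  // -1 means "buffer nothing", matching the TrainerMain.cpp setting.
  FLAGS_logbuflevel = -1;
  google::InitGoogleLogging(argv[0]);
  LOG(INFO) << "written out immediately";
  return 0;
}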
*/ #include -P_DECLARE_string(config); -P_DECLARE_string(config_args); -P_DEFINE_string(merger, - "./paddle_merge_model", - "path to paddle_merge_model binary"); +DECLARE_string(config); +DECLARE_string(config_args); +DEFINE_string(merger, + "./paddle_merge_model", + "path to paddle_merge_model binary"); using namespace paddle; // NOLINT using namespace std; // NOLINT diff --git a/paddle/trainer/tests/test_Trainer.cpp b/paddle/trainer/tests/test_Trainer.cpp index 0fede59f8d..371282dd6b 100644 --- a/paddle/trainer/tests/test_Trainer.cpp +++ b/paddle/trainer/tests/test_Trainer.cpp @@ -28,10 +28,10 @@ static const string& configFile3 = "trainer/tests/chunking.conf"; static const string& configFile4 = "trainer/tests/sample_trainer_config_parallel.conf"; -P_DECLARE_bool(use_gpu); -P_DECLARE_string(config); -P_DECLARE_int32(gpu_id); -P_DECLARE_bool(allow_only_one_model_on_one_gpu); +DECLARE_bool(use_gpu); +DECLARE_string(config); +DECLARE_int32(gpu_id); +DECLARE_bool(allow_only_one_model_on_one_gpu); void checkGradientTest(const string& configFile, bool useGpu, diff --git a/paddle/trainer/tests/test_TrainerOnePass.cpp b/paddle/trainer/tests/test_TrainerOnePass.cpp index 0b587ecce1..ee21008aec 100644 --- a/paddle/trainer/tests/test_TrainerOnePass.cpp +++ b/paddle/trainer/tests/test_TrainerOnePass.cpp @@ -27,12 +27,12 @@ static const string& configFile1 = "trainer/tests/sample_trainer_config.conf"; static const string& configFile2 = "trainer/tests/sample_trainer_config_parallel.conf"; -P_DECLARE_bool(use_gpu); -P_DECLARE_string(config); -P_DECLARE_int32(gpu_id); -P_DECLARE_int32(seed); -P_DECLARE_int32(num_passes); -P_DECLARE_int32(saving_period); +DECLARE_bool(use_gpu); +DECLARE_string(config); +DECLARE_int32(gpu_id); +DECLARE_int32(seed); +DECLARE_int32(num_passes); +DECLARE_int32(saving_period); class TrainerForTest : public paddle::Trainer { public: @@ -122,10 +122,10 @@ TEST(average_window_cpu, gpu4) { #endif // 3. test trainer + pserver. -P_DECLARE_int32(num_gradient_servers); -P_DECLARE_int32(port); -P_DECLARE_bool(local); -P_DECLARE_bool(use_old_updater); +DECLARE_int32(num_gradient_servers); +DECLARE_int32(port); +DECLARE_bool(local); +DECLARE_bool(use_old_updater); double checkRemoteParameterUpdater(TrainerForTest& trainer) { auto gradientMachine = trainer.getGradientMachine(); diff --git a/paddle/trainer/tests/test_recurrent_machine_generation.cpp b/paddle/trainer/tests/test_recurrent_machine_generation.cpp index 7d8dfd788f..03446b3b2f 100644 --- a/paddle/trainer/tests/test_recurrent_machine_generation.cpp +++ b/paddle/trainer/tests/test_recurrent_machine_generation.cpp @@ -30,7 +30,7 @@ static string modelDir = "trainer/tests/rnn_gen_test_model_dir/t1"; // NOLINT static string expectFile = // NOLINT "trainer/tests/rnn_gen_test_model_dir/r1.test"; // NOLINT -P_DECLARE_string(config_args); +DECLARE_string(config_args); vector readRetFile(const string& fname) { ifstream inFile(fname); diff --git a/paddle/utils/BarrierStat.cpp b/paddle/utils/BarrierStat.cpp index 9dde155aca..a6dbdcae3f 100644 --- a/paddle/utils/BarrierStat.cpp +++ b/paddle/utils/BarrierStat.cpp @@ -20,15 +20,15 @@ limitations under the License. 
*/ #include "paddle/utils/Flags.h" #include "paddle/utils/Stat.h" -P_DEFINE_bool(log_barrier_abstract, - true, - "if true, show abstract of barrier performance"); -P_DEFINE_int32(log_barrier_lowest_nodes, - 5, - "how many lowest node will be logged"); -P_DEFINE_bool(log_barrier_show_log, - false, // for performance tuning insight - "if true, always show barrier abstract even with little gap"); +DEFINE_bool(log_barrier_abstract, + true, + "if true, show abstract of barrier performance"); +DEFINE_int32(log_barrier_lowest_nodes, + 5, + "how many lowest node will be logged"); +DEFINE_bool(log_barrier_show_log, + false, // for performance tuning insight + "if true, always show barrier abstract even with little gap"); namespace paddle { diff --git a/paddle/utils/CommandLineParser.cpp b/paddle/utils/CommandLineParser.cpp index 51558b45a1..63f16bc54c 100644 --- a/paddle/utils/CommandLineParser.cpp +++ b/paddle/utils/CommandLineParser.cpp @@ -13,220 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "CommandLineParser.h" -#ifndef PADDLE_USE_GFLAGS -#include -#include -#include -#include -#include -#include -#include -#include -#include "paddle/utils/StringUtil.h" -namespace paddle { - -static constexpr int kStatusOK = 0; -static constexpr int kStatusInvalid = 1; -static constexpr int kStatusNotFound = 2; - -/** - * \brief: Convert a string to any type value. - * - * \note: It will specialize by type T that is supported. - */ -template -bool StringToValue(const std::string& content, T* value) { - bool ok; - *value = str::toWithStatus(content, &ok); - return ok; -} - -template <> -bool StringToValue(const std::string& content, bool* value) { - std::string tmp = content; - - std::transform(tmp.begin(), tmp.end(), tmp.begin(), [](char in) -> char { - if (in <= 'Z' && in >= 'A') { - return in - ('Z' - 'z'); - } else { - return in; - } - }); // tolower. - - if (tmp == "true" || tmp == "1") { - *value = true; - return true; - } else if (tmp == "false" || tmp == "0") { - *value = false; - return true; - } else { - return false; - } -} - -template <> -bool StringToValue(const std::string& content, - std::string* value) { - *value = content; - return true; -} - -/** - * \brief Parse argument "--blah=blah". - * - * \param argument: The command line argument string, such as "--blah=blah" - * \param [out] extraInfo: The details error message for parse argument. 
- * \return: kStatusOK, kStatusInvalid, kStatusNotFound - */ -template -int ParseArgument(const std::string& argument, std::string* extraInfo) { - for (auto& command : - flags_internal::CommandLineFlagRegistry::Instance()->commands) { - std::string& name = command.name; - T* value = command.value; - - std::string prefix = "--"; - prefix += name; - prefix += "="; - std::string content; - if (str::startsWith(argument, prefix)) { - content = argument.substr(prefix.size(), argument.size() - prefix.size()); - } else { - prefix = "-"; - prefix += name; - prefix += "="; - if (str::startsWith(argument, prefix)) { - content = - argument.substr(prefix.size(), argument.size() - prefix.size()); - } - } - - if (!content.empty()) { - if (StringToValue(content, value)) { - return kStatusOK; - } else { - *extraInfo = name; - return kStatusInvalid; - } - } - } - return kStatusNotFound; -} - -/** - * @brief ParseBoolArgumentExtra - * parse '--flag_name', '-flag_name' as true; '--noflag_name', '-noflag_name' as - * false - */ -static int ParseBoolArgumentExtra(const std::string& argument, - std::string* extraInfo) { - (void)(extraInfo); // unused extraInfo, just make api same. - - //! @warning: The order and content of prefixes is DESIGNED for parsing - //! command line. The length of prefixes are 1, 2, 3, 4. The parse logic takes - //! use of this fact. DO NOT CHANGE IT without reading how to parse command - //! below. - static const std::vector> prefixes = { - {"-", true}, {"--", true}, {"-no", false}, {"--no", false}}; - - for (flags_internal::CommandLineFlagRegistry::Command& command : - flags_internal::CommandLineFlagRegistry::Instance()->commands) { - if (argument.size() > command.name.size()) { - //! Use the length of prefix is 1, 2, 3, 4. - size_t diff = argument.size() - command.name.size() - 1UL; - if (diff < prefixes.size()) { - const std::string& prefix = std::get<0>(prefixes[diff]); - if (argument == prefix + command.name) { - *command.value = std::get<1>(prefixes[diff]); - return kStatusOK; - } - } - } - } - return kStatusNotFound; -} - -/** - * \brief: Print command line arguments' usage with type T. - */ -template -static void PrintTypeUsage() { - for (auto& command : - flags_internal::CommandLineFlagRegistry::Instance()->commands) { - std::string& name = command.name; - name = "--" + name; // Program will exit, so modify name is safe. - std::string& desc = command.text; - T& defaultValue = command.defaultValue; - std::cerr << std::setw(20) << name << ": " << desc - << "[default:" << defaultValue << "]." << std::endl; - } -} - -template -static void PrintTypeUsages() { - int unused[] = {0, (PrintTypeUsage(), 0)...}; - (void)(unused); -} -/** - * \brief: Print all usage, and exit(1) - */ -static void PrintUsageAndExit(const char* argv0) { - std::cerr << "Program " << argv0 << " Flags: " << std::endl; - PrintTypeUsages(); - exit(1); -} - -/** - * \brief: Print the error flags, usage, and exit. - */ -static void PrintParseError(const std::string& name, - const char* actualInput, - const char* arg0) { - std::cerr << "Parse command flag " << name << " error! 
User input is " - << actualInput << std::endl; - PrintUsageAndExit(arg0); -} - -void ParseCommandLineFlags(int* argc, char** argv, bool withHelp) { - int unused_argc = 1; - std::string extra; - for (int i = 1; i < *argc; ++i) { - std::string arg = argv[i]; - int s = kStatusInvalid; -#define ParseArgumentWithType(type) \ - s = ParseArgument(arg, &extra); \ - if (s == kStatusOK) { \ - continue; \ - } else if (s == kStatusInvalid) { \ - PrintParseError(extra, argv[i], argv[0]); \ - } - - ParseArgumentWithType(bool); // NOLINT - ParseArgumentWithType(int32_t); - ParseArgumentWithType(double); // NOLINT - ParseArgumentWithType(int64_t); - ParseArgumentWithType(uint64_t); - ParseArgumentWithType(std::string); - -#undef ParseArgumentWithType - s = ParseBoolArgumentExtra(arg, &extra); - if (s == kStatusOK) { - continue; - } - - if (withHelp && (arg == "--help" || arg == "-h")) { - PrintUsageAndExit(argv[0]); - } - - // NOT Found for all flags. - std::swap(argv[unused_argc++], argv[i]); - } - *argc = unused_argc; -} - -} // namespace paddle -#else namespace paddle { #ifndef GFLAGS_NS #define GFLAGS_NS google @@ -243,4 +30,3 @@ void ParseCommandLineFlags(int* argc, char** argv, bool withHelp) { } } // namespace paddle -#endif diff --git a/paddle/utils/CommandLineParser.h b/paddle/utils/CommandLineParser.h index b4449c6f09..4e89f90bb9 100644 --- a/paddle/utils/CommandLineParser.h +++ b/paddle/utils/CommandLineParser.h @@ -13,167 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -#ifndef PADDLE_USE_GFLAGS -#include -#include -#include -#include "DisableCopy.h" -namespace paddle { - -namespace flags_internal { - -/** - * Command line flag registry for special type T. It will store all command - * arguments settings. such as name, default value. - */ -template -struct CommandLineFlagRegistry { - /** - * The factory method of CommandLineFlagRegistry - * - * \return: The singleton instance of CommandLineFlagRegistry. - */ - static CommandLineFlagRegistry* Instance() { - static CommandLineFlagRegistry instance_; - return &instance_; - } - - struct Command { - /// name of argument. - std::string name; - /// address of actual variable. such as FLAGS_xxx. - T* value; - /// usage text. - std::string text; - /// default value of this command. - T defaultValue; - }; - - /// the command line arguments of type T. - std::vector commands; - - DISABLE_COPY(CommandLineFlagRegistry); - -private: - inline CommandLineFlagRegistry() {} -}; - -/** - *Helper class to register command line flag. - */ -template -struct CommandLineFlagRegister { - /** - * \brief: Register a command line argument - * - * \param [in] name: The command line name. - * \param [inout] val: The command line argument instance, FLAGS_xxx. - * \param [in] desc: The command line helper message. - */ - CommandLineFlagRegister(const std::string& name, - T* val, - const std::string desc) { - CommandLineFlagRegistry::Instance()->commands.push_back( - {name, val, desc, *val}); - } -}; - -/** - * \brief: Define a command line arguments. - * - * \param type: The variable type, such as int, double, etc. - * \param name: The variable name. The command line argument is '--name', the - *variable - *is 'FLAGS_name' - * \param default_value: The default value of command line argument. - * \param text: The description in command line argument. 
- */ -#define PADDLE_DEFINE_variable(type, name, default_value, text) \ - type FLAGS_##name = default_value; \ - namespace paddle_flags_internal { \ - paddle::flags_internal::CommandLineFlagRegister \ - flags_internal_var_##name(#name, &FLAGS_##name, text); \ - } // namespace paddle_flags_internal - -/** - * Declare a variable to use. - */ -#define PADDLE_DECLARE_variable(type, name) extern type FLAGS_##name; - -// DEFINE macro for each types. -#define P_DEFINE_int32(name, default_value, text) \ - PADDLE_DEFINE_variable(int32_t, name, default_value, text) - -#define P_DEFINE_bool(name, default_value, text) \ - PADDLE_DEFINE_variable(bool, name, default_value, text) - -#define P_DEFINE_string(name, default_value, text) \ - PADDLE_DEFINE_variable(std::string, name, default_value, text) - -#define P_DEFINE_double(name, default_value, text) \ - PADDLE_DEFINE_variable(double, name, default_value, text) - -#define P_DEFINE_int64(name, default_value, text) \ - PADDLE_DEFINE_variable(int64_t, name, default_value, text) - -#define P_DEFINE_uint64(name, default_value, text) \ - PADDLE_DEFINE_variable(uint64_t, name, default_value, text) - -// Declare macro for each types. -#define P_DECLARE_int32(name) PADDLE_DECLARE_variable(int32_t, name) -#define P_DECLARE_bool(name) PADDLE_DECLARE_variable(bool, name) -#define P_DECLARE_string(name) PADDLE_DECLARE_variable(std::string, name) -#define P_DECLARE_double(name) PADDLE_DECLARE_variable(double, name) -#define P_DECLARE_int64(name) PADDLE_DECLARE_variable(int64_t, name) -#define P_DECLARE_uint64(name) PADDLE_DECLARE_variable(uint64_t, name) -} // namespace flags_internal - -/** - * \brief Parse command line flags. If parse error, just failed and exit 1. - * - * \param [inout] argc: The command argument count. This method will modify - *argc, and left unused arguments. - * \param [inout] argv: The command argument values. This method will modify - *argv, and left unused arguments. - * \param [in] withHelp: True will parse '-h' and '--help' to print usage. - * - * \note: The Command line flags format basically as follow: - * - * * If the type of flag is not bool, then the follow format of command line - * will be parsed: - * * --flag_name=value - * * -flag_name=value - * - * * If the flag is bool, then: - * * --flag_name=value, -flag_name=value will be parsed. - * * if value.tolower() == "true"| "1" will be treated as true. - * * else if value.tolower() == "false" | "0" will be treated as false. - * * --flag_name will be parsed as true. - * * --noflag_name will be parsed as false. - */ -void ParseCommandLineFlags(int* argc, char** argv, bool withHelp = true); - -} // namespace paddle - -#else // if use gflags. #include -#define P_DEFINE_int32 DEFINE_int32 -#define P_DEFINE_bool DEFINE_bool -#define P_DEFINE_string DEFINE_string -#define P_DEFINE_double DEFINE_double -#define P_DEFINE_int64 DEFINE_int64 -#define P_DEFINE_uint64 DEFINE_uint64 -#define P_DECLARE_int32 DECLARE_int32 -#define P_DECLARE_bool DECLARE_bool -#define P_DECLARE_string DECLARE_string -#define P_DECLARE_double DECLARE_double -#define P_DECLARE_int64 DECLARE_int64 -#define P_DECLARE_uint64 DECLARE_uint64 namespace paddle { void ParseCommandLineFlags(int* argc, char** argv, bool withHelp = true); } // namespace paddle - -#endif diff --git a/paddle/utils/CustomStackTrace.cpp b/paddle/utils/CustomStackTrace.cpp index 083f5c509a..66b38218a7 100644 --- a/paddle/utils/CustomStackTrace.cpp +++ b/paddle/utils/CustomStackTrace.cpp @@ -16,7 +16,7 @@ limitations under the License. 
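The header block deleted above shows how the in-house parser worked: each PADDLE_DEFINE_variable expansion created the FLAGS_* global plus a static registrar object whose constructor recorded the flag in a per-type singleton registry, which ParseCommandLineFlags then walked. A stripped-down sketch of that self-registration idiom; the names here are illustrative, not Paddle's:

#include <string>
#include <vector>

template <typename T>
struct FlagRegistry {
  struct Entry {
    std::string name;
    T* value;
  };
  static FlagRegistry* instance() {
    static FlagRegistry r;  // one registry per flag type
    return &r;
  }
  std::vector<Entry> entries;
};

template <typename T>
struct FlagRegister {
  FlagRegister(const std::string& name, T* value) {
    FlagRegistry<T>::instance()->entries.push_back({name, value});
  }
};

// The macro defines the global and registers it as a side effect of
// static initialization, mirroring the deleted PADDLE_DEFINE_variable.
#define MY_DEFINE_bool(name, default_value, text) \
  bool FLAGS_##name = default_value;              \
  static FlagRegister<bool> reg_##name(#name, &FLAGS_##name);

MY_DEFINE_bool(demo_flag, false, "illustrative flag")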
*/ #include #include "CommandLineParser.h" -P_DEFINE_bool( +DEFINE_bool( layer_stack_error_only_current_thread, true, "Dump current thread or whole process layer stack when signal error " diff --git a/paddle/utils/Flags.cpp b/paddle/utils/Flags.cpp index 1c9e602f45..59d6cbdc51 100644 --- a/paddle/utils/Flags.cpp +++ b/paddle/utils/Flags.cpp @@ -15,65 +15,61 @@ limitations under the License. */ #include "Flags.h" #ifdef PADDLE_ONLY_CPU -P_DEFINE_bool(use_gpu, false, "Only support CPU training"); +DEFINE_bool(use_gpu, false, "Only support CPU training"); #else -P_DEFINE_bool(use_gpu, true, "Whether to use GPU for training"); +DEFINE_bool(use_gpu, true, "Whether to use GPU for training"); #endif -P_DEFINE_bool( - parallel_nn, - false, - "Whether to use multi-threads to calculate one neural network." - "If it was set false, use gpu_id specify which gpu core to use" - "(the device property in the trainer config file will be ingored)." - "If it was set true, the gpu core is specified by the trainer" - " config file(gpu_id will be ignored)."); -P_DEFINE_int32(trainer_count, 1, "Defined how many trainers to train"); -P_DEFINE_int32(gpu_id, 0, "Which gpu core to use"); -P_DEFINE_int32(port, 20134, "Listening port for pserver"); -P_DEFINE_int32(data_server_port, 21134, "Listening port for dserver"); -P_DEFINE_int32(ports_num, - 1, - "The ports number for parameter send," - " increment based on default port number"); -P_DEFINE_int32(ports_num_for_sparse, - 0, - "The ports number for parameter send," - " increment based on default (port + ports_num)"); -P_DEFINE_string(nics, "xgbe0,xgbe1", "network device name for pservers"); -P_DEFINE_string(rdma_tcp, "tcp", "use rdma or tcp rdma transport protocol"); -P_DEFINE_int32( - trainer_id, - 0, - "For distributed training, each trainer must be given an unique id" - " ranging from 0 to num_trainers-1. Trainer 0 is the master" - " trainer"); -P_DEFINE_int32(num_gradient_servers, 1, "number of gradient servers"); -P_DEFINE_string(comment, "", "A string for commenting this training task"); -P_DEFINE_string(load_missing_parameter_strategy, - "fail", - "which operation to take on load model fails. support " - "fail/rand/zero only."); -P_DEFINE_int32(log_period, 100, "Log progress every so many batches"); -P_DEFINE_int32(log_period_server, - 500, - "Log progress every so many batches at pserver end"); -P_DEFINE_double(checkgrad_eps, 1e-5, "parameter change size for checkgrad"); -P_DEFINE_int32(enable_parallel_vector, - 0, - "threshold for enable parallel vector"); -P_DEFINE_bool(loadsave_parameters_in_pserver, - false, - "load and save parameters in pserver. " - "only work while parameter set sparse_remote_update."); -P_DEFINE_int32(beam_size, - 1, - "Beam size used in generating most probable output sequences."); +DEFINE_bool(parallel_nn, + false, + "Whether to use multi-threads to calculate one neural network." + "If it was set false, use gpu_id specify which gpu core to use" + "(the device property in the trainer config file will be ingored)." 
+ "If it was set true, the gpu core is specified by the trainer" + " config file(gpu_id will be ignored)."); +DEFINE_int32(trainer_count, 1, "Defined how many trainers to train"); +DEFINE_int32(gpu_id, 0, "Which gpu core to use"); +DEFINE_int32(port, 20134, "Listening port for pserver"); +DEFINE_int32(data_server_port, 21134, "Listening port for dserver"); +DEFINE_int32(ports_num, + 1, + "The ports number for parameter send," + " increment based on default port number"); +DEFINE_int32(ports_num_for_sparse, + 0, + "The ports number for parameter send," + " increment based on default (port + ports_num)"); +DEFINE_string(nics, "xgbe0,xgbe1", "network device name for pservers"); +DEFINE_string(rdma_tcp, "tcp", "use rdma or tcp rdma transport protocol"); +DEFINE_int32(trainer_id, + 0, + "For distributed training, each trainer must be given an unique id" + " ranging from 0 to num_trainers-1. Trainer 0 is the master" + " trainer"); +DEFINE_int32(num_gradient_servers, 1, "number of gradient servers"); +DEFINE_string(comment, "", "A string for commenting this training task"); +DEFINE_string(load_missing_parameter_strategy, + "fail", + "which operation to take on load model fails. support " + "fail/rand/zero only."); +DEFINE_int32(log_period, 100, "Log progress every so many batches"); +DEFINE_int32(log_period_server, + 500, + "Log progress every so many batches at pserver end"); +DEFINE_double(checkgrad_eps, 1e-5, "parameter change size for checkgrad"); +DEFINE_int32(enable_parallel_vector, 0, "threshold for enable parallel vector"); +DEFINE_bool(loadsave_parameters_in_pserver, + false, + "load and save parameters in pserver. " + "only work while parameter set sparse_remote_update."); +DEFINE_int32(beam_size, + 1, + "Beam size used in generating most probable output sequences."); -P_DEFINE_bool(show_layer_stat, false, "show the statistics of each layer"); -P_DEFINE_string(predict_file, "", "File name for saving predict result"); -P_DEFINE_bool(prev_batch_state, false, "batch is continue with next batch"); -P_DEFINE_string(init_model_path, - "", - "Path of the initial model parameters." - "If it was set, start_pass will be ignored."); +DEFINE_bool(show_layer_stat, false, "show the statistics of each layer"); +DEFINE_string(predict_file, "", "File name for saving predict result"); +DEFINE_bool(prev_batch_state, false, "batch is continue with next batch"); +DEFINE_string(init_model_path, + "", + "Path of the initial model parameters." + "If it was set, start_pass will be ignored."); diff --git a/paddle/utils/Flags.h b/paddle/utils/Flags.h index 922533d63e..2ebbcb24eb 100644 --- a/paddle/utils/Flags.h +++ b/paddle/utils/Flags.h @@ -16,28 +16,28 @@ limitations under the License. 
*/ #include "CommandLineParser.h" -P_DECLARE_bool(parallel_nn); -P_DECLARE_int32(async_count); -P_DECLARE_int32(port); -P_DECLARE_int32(data_server_port); -P_DECLARE_bool(use_gpu); -P_DECLARE_int32(gpu_id); -P_DECLARE_int32(trainer_count); -P_DECLARE_int32(ports_num); -P_DECLARE_int32(ports_num_for_sparse); -P_DECLARE_string(nics); -P_DECLARE_string(rdma_tcp); -P_DECLARE_int32(trainer_id); -P_DECLARE_int32(num_gradient_servers); -P_DECLARE_string(comment); -P_DECLARE_string(load_missing_parameter_strategy); -P_DECLARE_int32(log_period); -P_DECLARE_int32(log_period_server); -P_DECLARE_double(checkgrad_eps); -P_DECLARE_int32(enable_parallel_vector); -P_DECLARE_bool(loadsave_parameters_in_pserver); -P_DECLARE_int32(beam_size); -P_DECLARE_bool(show_layer_stat); -P_DECLARE_string(predict_file); -P_DECLARE_bool(prev_batch_state); -P_DECLARE_string(init_model_path); +DECLARE_bool(parallel_nn); +DECLARE_int32(async_count); +DECLARE_int32(port); +DECLARE_int32(data_server_port); +DECLARE_bool(use_gpu); +DECLARE_int32(gpu_id); +DECLARE_int32(trainer_count); +DECLARE_int32(ports_num); +DECLARE_int32(ports_num_for_sparse); +DECLARE_string(nics); +DECLARE_string(rdma_tcp); +DECLARE_int32(trainer_id); +DECLARE_int32(num_gradient_servers); +DECLARE_string(comment); +DECLARE_string(load_missing_parameter_strategy); +DECLARE_int32(log_period); +DECLARE_int32(log_period_server); +DECLARE_double(checkgrad_eps); +DECLARE_int32(enable_parallel_vector); +DECLARE_bool(loadsave_parameters_in_pserver); +DECLARE_int32(beam_size); +DECLARE_bool(show_layer_stat); +DECLARE_string(predict_file); +DECLARE_bool(prev_batch_state); +DECLARE_string(init_model_path); diff --git a/paddle/utils/Logging.cpp b/paddle/utils/Logging.cpp index 20f32466a5..5a1c6ecb22 100644 --- a/paddle/utils/Logging.cpp +++ b/paddle/utils/Logging.cpp @@ -18,175 +18,9 @@ limitations under the License. */ */ #include "Logging.h" -#ifndef PADDLE_USE_GLOG -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include namespace paddle { -namespace internal { - -std::string join(const std::string& part1, const std::string& part2) { - const char sep = '/'; - if (!part2.empty() && part2.front() == sep) { - return part2; - } - std::string ret; - ret.reserve(part1.size() + part2.size() + 1); - ret = part1; - if (!ret.empty() && ret.back() != sep) { - ret += sep; - } - ret += part2; - return ret; -} - -static inline bool env2bool(const char* envName, bool defaultValue = false) { - char* envValue = getenv(envName); - if (envValue == nullptr) { - return defaultValue; - } else { - return memchr("tTyY1\0", envValue[0], 6) != nullptr; - } -} - -static inline int env2int(const char* envName, int defaultValue = 0) { - char* envValue = getenv(envName); - if (envValue == nullptr) { - return defaultValue; - } else { - int retValue = defaultValue; - try { - retValue = std::stoi(envValue); - } catch (...) 
-      // pass
-    }
-    return retValue;
-  }
-}
-
-static inline int env2index(const char* envName,
-                            const std::vector<std::string>& options,
-                            int defaultValue) {
-  char* envValue = getenv(envName);
-  if (envValue == nullptr) {
-    return defaultValue;
-  } else {
-    for (size_t i = 0; i < options.size(); ++i) {
-      if (options[i] == envValue) {
-        return static_cast<int>(i);
-      }
-    }
-    return defaultValue;
-  }
-}
-
-static bool gLogToStderr = env2bool("PLOG_LOGTOSTDERR", true);
-static const std::vector<std::string> gLevelName = {
-    "INFO", "WARNING", "ERROR", "FATAL"};
-static int gMinLogLevel =
-    env2int("PLOG_MINLOGLEVEL", env2index("PLOG_MINLOGLEVEL", gLevelName, 0));
-
-static std::vector<std::vector<int>> gLogFds;
-static std::vector<int> gLogFileFds;
-static bool gLogInited = false;
-static void freeLogFileFds() {
-  for (auto fd : gLogFileFds) {
-    close(fd);
-  }
-}
-
-static void initializeLogFds(char* argv0) {
-  gLogFds.resize(NUM_SEVERITIES);
-
-  for (int i = gMinLogLevel; i < NUM_SEVERITIES && gLogToStderr;
-       ++i) {  // Add stderr
-    std::vector<int>& fds = gLogFds[i];
-    fds.push_back(STDERR_FILENO);
-  }
-
-  char* logDir = getenv("PLOG_LOGDIR");
-
-  for (int i = gMinLogLevel; i < NUM_SEVERITIES && logDir != nullptr; ++i) {
-    std::string filename =
-        join(logDir, std::string(argv0) + "." + gLevelName[i]);
-    int fd = open(filename.c_str(), O_CREAT | O_WRONLY, 0644);
-    if (fd == -1) {
-      fprintf(stderr, "Open log file error!");
-      exit(1);
-    }
-    gLogFileFds.push_back(fd);
-
-    std::vector<int>& curFds = gLogFds[i];
-    curFds.insert(curFds.end(), gLogFileFds.begin(), gLogFileFds.end());
-  }
-
-  atexit(freeLogFileFds);
-  gLogInited = true;
-}
-
-static void (*gFailureFunctionPtr)() ATTR_NORETURN = abort;
-
-LogMessage::LogMessage(const char* fname, int line, int severity)
-    : fname_(fname), line_(line), severity_(severity) {}
-
-LogMessage::~LogMessage() { this->generateLogMessage(); }
-
-void LogMessage::generateLogMessage() {
-  if (!gLogInited) {
-    fprintf(stderr,
-            "%c %s:%d] %s\n",
-            "IWEF"[severity_],
-            fname_,
-            line_,
-            str().c_str());
-  } else {
-    for (auto& fd : gLogFds[this->severity_]) {
-      dprintf(fd,
-              "%c %s:%d] %s\n",
-              "IWEF"[severity_],
-              fname_,
-              line_,
-              str().c_str());
-    }
-  }
-}
-
-LogMessageFatal::LogMessageFatal(const char* file, int line)
-    : LogMessage(file, line, FATAL) {}
-
-LogMessageFatal::~LogMessageFatal() {
-  generateLogMessage();
-  gFailureFunctionPtr();
-}
-}  // namespace internal
-
-void initializeLogging(int argc, char** argv) {
-  internal::initializeLogFds(argv[0]);
-}
-
-namespace logging {
-void setMinLogLevel(int level) { paddle::internal::gMinLogLevel = level; }
-
-void installFailureFunction(void (*callback)() ATTR_NORETURN) {
-  paddle::internal::gFailureFunctionPtr = callback;
-}
-
-}  // namespace logging
-
-}  // namespace paddle
-
-#else
-namespace paddle {
 void initializeLogging(int argc, char** argv) {
   (void)(argc);
   if (!getenv("GLOG_logtostderr")) {
@@ -197,13 +31,16 @@ void initializeLogging(int argc, char** argv) {
 }
 
 namespace logging {
+
 void setMinLogLevel(int level) { FLAGS_minloglevel = level; }
+
 void installFailureFunction(void (*callback)()) {
   google::InstallFailureFunction(callback);
 }
+
 void installFailureWriter(void (*callback)(const char*, int)) {
   google::InstallFailureWriter(callback);
 }
+
 }  // namespace logging
 }  // namespace paddle
-#endif
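After this hunk, Logging.cpp is a thin shim over glog: initializeLogging forces logging to stderr unless GLOG_logtostderr is already set, and setMinLogLevel just writes glog's FLAGS_minloglevel. A sketch of the call side; the entry points are Paddle's, the main() around them is illustrative:

```cpp
// Hypothetical driver for the glog-backed entry points kept above.
#include "paddle/utils/Logging.h"

int main(int argc, char** argv) {
  paddle::initializeLogging(argc, argv);
  paddle::logging::setMinLogLevel(0);  // 0 == INFO, via FLAGS_minloglevel
  LOG(INFO) << "plain glog macros now, no P_LOG/P_CHECK wrappers";
  CHECK_EQ(2 + 2, 4) << "CHECK_* also comes straight from glog";
  return 0;
}
```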
diff --git a/paddle/utils/Logging.h b/paddle/utils/Logging.h
index 4379289f6d..d9e551f089 100644
--- a/paddle/utils/Logging.h
+++ b/paddle/utils/Logging.h
@@ -22,175 +22,21 @@ limitations under the License. */
 #include
 #include
-#ifndef PADDLE_USE_GLOG
-#include "CompilerMacros.h"
-
-//! TODO(yuyang18): Move this utility macro into some global header.
-#define PP_CAT(a, b) PP_CAT_I(a, b)
-#define PP_CAT_I(a, b) PP_CAT_II(~, a##b)
-#define PP_CAT_II(p, res) res
-
-/**
- * Generate Unique Variable Name, Usefully in macro.
- * @SEE
- * http://stackoverflow.com/questions/1082192/how-to-generate-random-variable-names-in-c-using-macros
- */
-#define UNIQUE_NAME(base) PP_CAT(base, __LINE__)
-
+#include <glog/logging.h>
 
 namespace paddle {
 
-//! Log levels.
-const int INFO = 0;
-const int WARNING = 1;
-const int ERROR = 2;
-const int FATAL = 3;
-const int NUM_SEVERITIES = 4;
-
-namespace internal {
-
-class LogMessage : public std::basic_ostringstream<char> {
-public:
-  LogMessage(const char* fname, int line, int severity);
-  ~LogMessage();
-
-protected:
-  /**
-   * @brief Print log message to stderr, files, etc.
-   */
-  void generateLogMessage();
-
-private:
-  const char* fname_;
-  int line_;
-  int severity_;
-};
-
-// LogMessageFatal ensures the process will exit in failure after
-// logging this message.
-class LogMessageFatal : public LogMessage {
-public:
-  LogMessageFatal(const char* file, int line) __attribute__((cold));
-  ~LogMessageFatal() __attribute__((noreturn));
-};
-
-#define _P_LOG_INFO \
-  ::paddle::internal::LogMessage(__FILE__, __LINE__, paddle::INFO)
-#define _P_LOG_WARNING \
-  ::paddle::internal::LogMessage(__FILE__, __LINE__, paddle::WARNING)
-#define _P_LOG_ERROR \
-  ::paddle::internal::LogMessage(__FILE__, __LINE__, paddle::ERROR)
-#define _P_LOG_FATAL ::paddle::internal::LogMessageFatal(__FILE__, __LINE__)
-
-#define P_LOG(severity) _P_LOG_##severity
-
-#define P_LOG_FIRST_N(severity, n) \
-  static int UNIQUE_NAME(LOG_OCCURRENCES) = 0; \
-  if (UNIQUE_NAME(LOG_OCCURRENCES) <= n) ++UNIQUE_NAME(LOG_OCCURRENCES); \
-  if (UNIQUE_NAME(LOG_OCCURRENCES) <= n) P_LOG(severity)
-
-#define P_LOG_IF_EVERY_N(severity, condition, n) \
-  static int UNIQUE_NAME(LOG_OCCURRENCES) = 0; \
-  if (condition && ((UNIQUE_NAME(LOG_OCCURRENCES) = \
-                         (UNIQUE_NAME(LOG_OCCURRENCES) + 1) % n) == (1 % n))) \
-  P_LOG(severity)
-
-#define P_LOG_EVERY_N(severity, n) P_LOG_IF_EVERY_N(severity, true, n)
-
-// TODO(jeff): Define a proper implementation of VLOG_IS_ON
-#define P_VLOG_IS_ON(lvl) ((lvl) <= 0)
-
-#define P_LOG_IF(severity, condition) \
-  if (condition) P_LOG(severity)
-
-#define P_VLOG(lvl) P_LOG_IF(INFO, P_VLOG_IS_ON(lvl))
-
-#define P_VLOG_IF(lvl, cond) P_LOG_IF(INFO, P_VLOG_IS_ON(lvl) && cond)
-
-#define P_VLOG_EVERY_N(lvl, n) P_LOG_IF_EVERY_N(INFO, P_VLOG_IS_ON(lvl), n)
-
-#define PREDICT_FALSE(x) (__builtin_expect(x, 0))
-#define PREDICT_TRUE(x) (__builtin_expect(!!(x), 1))
-
-// CHECK dies with a fatal error if condition is not true. It is *not*
-// controlled by NDEBUG, so the check will be executed regardless of
-// compilation mode. Therefore, it is safe to do things like:
-//    CHECK(fp->Write(x) == 4)
-#define P_CHECK(condition) \
-  if (PREDICT_FALSE(!(condition))) \
-  P_LOG(FATAL) << "Check failed: " #condition " "
-
-#define P_CHECK_EQ(val1, val2) P_CHECK((val1) == (val2))
-#define P_CHECK_NE(val1, val2) P_CHECK((val1) != (val2))
-#define P_CHECK_LE(val1, val2) P_CHECK((val1) <= (val2))
-#define P_CHECK_LT(val1, val2) P_CHECK((val1) < (val2))
-#define P_CHECK_GE(val1, val2) P_CHECK((val1) >= (val2))
-#define P_CHECK_GT(val1, val2) P_CHECK((val1) > (val2))
-#define P_CHECK_NOTNULL(val) P_CHECK((val) != NULL)
-
-//! GLOG compatible APIs
-//! NOTE: only implement Paddle actually used APIs.
-#define LOG(x) P_LOG(x)
-#define VLOG(x) P_VLOG(x)
-#define DLOG(x) P_VLOG(5)
-#define CHECK(x) P_CHECK(x)
-#define PCHECK(x) P_CHECK(x)
-#define CHECK_EQ(val1, val2) P_CHECK((val1) == (val2))
-#define CHECK_NE(val1, val2) P_CHECK((val1) != (val2))
-#define CHECK_LE(val1, val2) P_CHECK((val1) <= (val2))
-#define CHECK_LT(val1, val2) P_CHECK((val1) < (val2))
-#define CHECK_GE(val1, val2) P_CHECK((val1) >= (val2))
-#define CHECK_GT(val1, val2) P_CHECK((val1) > (val2))
-#define CHECK_NOTNULL(val) P_CHECK((val) != NULL)
-#define VLOG_IS_ON(x) P_VLOG_IS_ON(x)
-#define LOG_FIRST_N(severity, n) P_LOG_FIRST_N(severity, n)
-#define LOG_IF(severity, condition) P_LOG_IF(severity, condition)
-#define VLOG_EVERY_N(lvl, n) P_VLOG_EVERY_N(lvl, n)
-#define VLOG_IF(lvl, cond) P_VLOG_IF(lvl, cond)
-#define LOG_EVERY_N(severity, n) P_LOG_EVERY_N(severity, n)
-}  // namespace internal
-
-/**
- * @brief initialize logging
- * @note: Current implement of logging is lack of:
- *          PrintCallStack when fatal.
- *          VLOG_IS_ON
- *        But it is portable to multi-platform, and simple enough to modify.
- */
 void initializeLogging(int argc, char** argv);
-namespace logging {
-/**
- * @brief Set Min Log Level. if Log.level < minLogLevel, then will not print log
- *        to stream
- * @param level. Any integer is OK, but only 0 <= x <= NUM_SEVERITIES is useful.
- */
-void setMinLogLevel(int level);
-
-/**
- * @brief Install Log(Fatal) failure function. Default is abort();
- * @param callback: The failure function.
- */
-void installFailureFunction(void (*callback)() ATTR_NORETURN);
-/**
- * @brief installFailureWriter
- * @note: not implemented currently.
- */
-inline void installFailureWriter(void (*callback)(const char*, int)) {
-  (void)(callback);  // unused callback.
-}
-}  // namespace logging
-}  // namespace paddle
-#else
-#include <glog/logging.h>
-namespace paddle {
-void initializeLogging(int argc, char** argv);
 namespace logging {
+
 void setMinLogLevel(int level);
+
 void installFailureFunction(void (*callback)());
+
 void installFailureWriter(void (*callback)(const char*, int));
-}  // namespace logging
-}
-#endif  // PADDLE_USE_GLOG
+
+}  // namespace logging
+}  // namespace paddle
 
 #ifndef NDEBUG
 #define DEBUG_LEVEL 5
diff --git a/paddle/utils/PythonUtil.cpp b/paddle/utils/PythonUtil.cpp
index 2ee4e4fb7e..7faeff55c2 100644
--- a/paddle/utils/PythonUtil.cpp
+++ b/paddle/utils/PythonUtil.cpp
@@ -20,8 +20,8 @@ namespace paddle {
 
 #ifdef PADDLE_NO_PYTHON
-P_DEFINE_string(python_path, "", "python path");
-P_DEFINE_string(python_bin, "python2.7", "python bin");
+DEFINE_string(python_path, "", "python path");
+DEFINE_string(python_bin, "python2.7", "python bin");
 
 constexpr int kExecuteCMDBufLength = 204800;
 
diff --git a/paddle/utils/ThreadLocal.cpp b/paddle/utils/ThreadLocal.cpp
index 8a2878fc4b..75ccbd28cf 100644
--- a/paddle/utils/ThreadLocal.cpp
+++ b/paddle/utils/ThreadLocal.cpp
@@ -16,9 +16,9 @@ limitations under the License. */
 #include "CommandLineParser.h"
 #include "Util.h"
 
-P_DEFINE_bool(thread_local_rand_use_global_seed,
-              false,
-              "Whether to use global seed in thread local rand.");
+DEFINE_bool(thread_local_rand_use_global_seed,
+            false,
+            "Whether to use global seed in thread local rand.");
 
 namespace paddle {
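The Logging.h hunk above keeps only three wrappers, now backed by glog's InstallFailureFunction and InstallFailureWriter; note that installFailureWriter was a no-op in the internal implementation and only becomes functional with glog. One use of the writer hook, where the wrapper names are Paddle's but the callback body and main() are illustrative:

```cpp
// Hypothetical fatal-message hook built on the wrappers kept above.
#include <cstdio>
#include "paddle/utils/Logging.h"

static void dumpFatal(const char* data, int size) {
  // glog hands the formatted FATAL record to the writer before aborting.
  fwrite(data, 1, static_cast<size_t>(size), stderr);
}

int main(int argc, char** argv) {
  paddle::initializeLogging(argc, argv);
  paddle::logging::installFailureWriter(dumpFatal);
  CHECK(1 == 2) << "this message reaches dumpFatal, then the process aborts";
}
```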
diff --git a/paddle/utils/Util.cpp b/paddle/utils/Util.cpp
index 26ff385c84..7c0d66c488 100644
--- a/paddle/utils/Util.cpp
+++ b/paddle/utils/Util.cpp
@@ -33,7 +33,7 @@ limitations under the License. */
 #include "ThreadLocal.h"
 #include "Version.h"
 
-P_DEFINE_int32(seed, 1, "random number seed. 0 for srand(time)");
+DEFINE_int32(seed, 1, "random number seed. 0 for srand(time)");
 
 #ifdef WITH_GOOGLE_PERFTOOLS
 /*
@@ -52,10 +52,8 @@ P_DEFINE_int32(seed, 1, "random number seed. 0 for srand(time)");
 
 #include
 
-P_DEFINE_int32(profile_signal, 12, "signal for switch google profiler");
-P_DEFINE_string(profile_data_file,
-                "gperf.prof",
-                "file for storing profile data");
+DEFINE_int32(profile_signal, 12, "signal for switch google profiler");
+DEFINE_string(profile_data_file, "gperf.prof", "file for storing profile data");
 
 static void profilerSwitch(int signalNumber) {
   bool static started = false;
diff --git a/paddle/utils/Version.cpp b/paddle/utils/Version.cpp
index a9e351b69f..731c308421 100644
--- a/paddle/utils/Version.cpp
+++ b/paddle/utils/Version.cpp
@@ -18,13 +18,8 @@ limitations under the License. */
 #include
 #include "Flags.h"
 #include "Util.h"
 
-//! TODO(yuyang18) in gflags, version has another define. Use another flag
-//! instead.
-#ifndef PADDLE_USE_GFLAGS
-P_DEFINE_bool(version, false, "print version");
-#else
-P_DECLARE_bool(version);
-#endif
+
+DECLARE_bool(version);
 
 namespace paddle {
 namespace version {
diff --git a/paddle/utils/tests/CMakeLists.txt b/paddle/utils/tests/CMakeLists.txt
index 298ede5cd6..26fafbd1ab 100644
--- a/paddle/utils/tests/CMakeLists.txt
+++ b/paddle/utils/tests/CMakeLists.txt
@@ -1,5 +1,3 @@
-add_simple_unittest(test_CommandLineParser)
-add_simple_unittest(test_Logging)
 add_simple_unittest(test_Thread)
 add_simple_unittest(test_StringUtils)
 add_simple_unittest(test_CustomStackTrace)
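Version.cpp can drop its conditional definition because gflags ships its own --version flag in its flag-reporting module, so declaring it is enough to read the value. A sketch of the pattern; DECLARE_bool(version) is what the hunk above adds, while maybePrintVersion() is a hypothetical consumer:

```cpp
// DECLARE_bool(version) links against the definition inside gflags itself.
#include <cstdio>
#include <gflags/gflags.h>

DECLARE_bool(version);

void maybePrintVersion() {
  if (FLAGS_version) {
    std::printf("application-specific version info would be printed here\n");
  }
}
```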
diff --git a/paddle/utils/tests/test_CommandLineParser.cpp b/paddle/utils/tests/test_CommandLineParser.cpp
deleted file mode 100644
index ed2b3068d5..0000000000
--- a/paddle/utils/tests/test_CommandLineParser.cpp
+++ /dev/null
@@ -1,114 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef PADDLE_USE_GFLAGS
-//! Test Command Line Parser for paddle internal implement.
-
-#include
-#include
-
-P_DEFINE_int32(i1, 1, "test int flag 1");
-P_DEFINE_int32(i2, 2, "test int flag 2");
-
-P_DEFINE_string(str1, "1", "test str flag 1");
-P_DEFINE_string(str2, "2", "test str flag 2");
-
-P_DEFINE_bool(b1, true, "test bool flag 1");
-P_DEFINE_bool(b2, false, "test bool flag 2");
-
-P_DEFINE_double(d1, 0.1, "test double flag 1");
-P_DEFINE_double(d2, -42.3, "test double flag 2");
-
-P_DEFINE_int64(l1, 1, "test int64 flag 1");
-P_DEFINE_int64(l2, 2, "test int64 flag 2");
-
-P_DEFINE_uint64(ul1, 32, "test uint64 flag 1");
-P_DEFINE_uint64(ul2, 33, "test uint64 flag 2");
-
-constexpr double EPSILON = 1e-5;
-
-#define cc(x) const_cast<char*>((x))
-
-TEST(CommandLineParser, defaultValue) {
-  char* argv[] = {cc("test_program"), cc("--unused_flag=134")};
-  int argc = sizeof(argv) / sizeof(char*);
-
-  paddle::ParseCommandLineFlags(&argc, argv);
-
-  // Check Default Value
-  ASSERT_EQ(argc, 2);
-  ASSERT_EQ(FLAGS_i1, 1);
-  ASSERT_EQ(FLAGS_i2, 2);
-  ASSERT_EQ(FLAGS_str1, "1");
-  ASSERT_EQ(FLAGS_str2, "2");
-  ASSERT_EQ(FLAGS_b1, true);
-  ASSERT_EQ(FLAGS_b2, false);
-  ASSERT_NEAR(FLAGS_d1, 0.1, EPSILON);
-  ASSERT_NEAR(FLAGS_d2, -42.3, EPSILON);
-  ASSERT_EQ(FLAGS_i1, 1);
-  ASSERT_EQ(FLAGS_i2, 2);
-  ASSERT_EQ(FLAGS_ul1, 32UL);
-  ASSERT_EQ(FLAGS_ul2, 33UL);
-}
-
-TEST(CommandLineParser, normal) {
-  char* argv[] = {cc("test_program"),
-                  cc("--i2=32"),
-                  cc("--str1=abc"),
-                  cc("--b2=1"),
-                  cc("-b1=False"),
-                  cc("--d2=.34"),
-                  cc("--d1=0"),
-                  cc("--l1=-12345678901234"),
-                  cc("-ul2=3212")};
-  int argc = sizeof(argv) / sizeof(char*);
-  paddle::ParseCommandLineFlags(&argc, argv);
-  ASSERT_EQ(argc, 1);
-  ASSERT_EQ(FLAGS_i2, 32);
-  ASSERT_EQ(FLAGS_str1, "abc");
-  ASSERT_EQ(FLAGS_b2, true);
-  ASSERT_EQ(FLAGS_b1, false);
-  ASSERT_NEAR(FLAGS_d2, 0.34, EPSILON);
-  ASSERT_NEAR(FLAGS_d1, 0.0, EPSILON);
-  ASSERT_EQ(FLAGS_l1, -12345678901234);
-  ASSERT_EQ(FLAGS_ul2, 3212UL);
-}
-
-TEST(CommandLineParser, printHelp) {
-  char* argv[] = {cc("test_program"), cc("--help")};
-  int argc = sizeof(argv) / sizeof(char*);
-
-  // Will Print Usage
-  ASSERT_DEATH(paddle::ParseCommandLineFlags(&argc, argv), ".*test_program.*");
-}
-
-TEST(CommandLineParser, parseError) {
-  char* argv[] = {cc("test_program"), cc("--i1=abc")};
-
-  int argc = sizeof(argv) / sizeof(char*);
-  ASSERT_DEATH(
-      paddle::ParseCommandLineFlags(&argc, argv),
-      "Parse command flag i1 error! User input is --i1=abc.*test_program.*");
-}
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
-
-#else
-
-int main(int argc, char** argv) { return 0; }
-
-#endif
diff --git a/paddle/utils/tests/test_CustomStackTrace.cpp b/paddle/utils/tests/test_CustomStackTrace.cpp
index 292ed4619d..2ce1998376 100644
--- a/paddle/utils/tests/test_CustomStackTrace.cpp
+++ b/paddle/utils/tests/test_CustomStackTrace.cpp
@@ -20,7 +20,7 @@ limitations under the License. */
 #include "paddle/utils/Locks.h"
 #include "paddle/utils/Util.h"
 
-P_DEFINE_int32(test_thread_num, 10, "testing thread number");
+DEFINE_int32(test_thread_num, 10, "testing thread number");
 
 void testNormalImpl(
     const std::function&,
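The deleted test exercised the internal parser through gtest death tests; the same ASSERT_DEATH pattern still applies to glog-backed fatal paths, which is presumably why it could be dropped without losing the technique. A minimal sketch, where the test name and message are illustrative:

```cpp
// glog prefixes FATAL records with 'F'; matching on the message itself
// keeps the assertion portable across log formats.
#include <glog/logging.h>
#include <gtest/gtest.h>

TEST(GlogDeathTest, FatalAborts) {
  auto die = [] { LOG(FATAL) << "boom"; };
  ASSERT_DEATH(die(), "boom");
}
```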
diff --git a/paddle/utils/tests/test_Logging.cpp b/paddle/utils/tests/test_Logging.cpp
deleted file mode 100644
index fbfffcc65a..0000000000
--- a/paddle/utils/tests/test_Logging.cpp
+++ /dev/null
@@ -1,162 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-/*
- * Basically from tensorflow/core/platform/default/logging.cc
- * Used in embedded system where there is no glogs.
- */
-
-#include
-#include
-#include
-#include
-#include "paddle/utils/Logging.h"
-#include "paddle/utils/Util.h"
-#ifndef PADDLE_USE_GLOG
-TEST(Logging, BasicalLog) {
-  auto pinfo = [] {
-    P_LOG(INFO) << "INFO";
-    exit(1);
-  };
-  ASSERT_DEATH(pinfo(), "I .*test_Logging.cpp:[0-9]+] INFO");
-
-  auto pwarn = [] {
-    P_LOG(WARNING) << "WARN";
-    exit(1);
-  };
-  ASSERT_DEATH(pwarn(), "W .*test_Logging.cpp:[0-9]+] WARN");
-
-  auto perr = [] {
-    P_LOG(ERROR) << "ERROR";
-    exit(1);
-  };
-  ASSERT_DEATH(perr(), "E .*test_Logging.cpp:[0-9]+] ERROR");
-
-  auto pfatal = [] { P_LOG(FATAL) << "FATAL"; };
-  ASSERT_DEATH(pfatal(), "F .*test_Logging.cpp:[0-9]+] FATAL");
-}
-
-TEST(Logging, Check) {
-  int a = 1;
-  int b = 2;
-  P_CHECK(a != b);
-
-  auto pcheckDown = [&] { P_CHECK(a == b); };
-  ASSERT_DEATH(pcheckDown(),
-               "F .*test_Logging.cpp:[0-9]+] Check failed: a == b ");
-
-  P_CHECK_LE(a, b);
-  P_CHECK_LT(a, b);
-  double t = 1.2;
-  P_CHECK_LE(a, t);
-  double* ptr = nullptr;
-
-  auto pcheckDown2 = [&] { P_CHECK_NOTNULL(ptr); };
-  ASSERT_DEATH(pcheckDown2(), "F");
-}
-
-#define cc(x) const_cast<char*>(x)
-
-TEST(Logging, LogToStderr) {
-  auto logToStderrCallback = [] {
-    setenv("PLOG_LOGTOSTDERR", "0", true);
-    char* argv[] = {cc("test")};
-    paddle::initializeLogging(1, argv);
-    P_LOG(INFO) << "This output will not print to std error";
-    exit(1);
-  };
-
-  ASSERT_DEATH(logToStderrCallback(), "");
-}
-
-constexpr char kLogDirName[] = "./test_log_dir";
-const std::vector<std::string> kLevels = {"INFO", "WARNING", "ERROR", "FATAL"};
-
-TEST(Logging, LogToDir) {
-  ASSERT_EQ(0, mkdir(kLogDirName, 0777));
-  auto logToDirCallback = [] {
-    setenv("PLOG_LOGTOSTDERR", "0", true);
-    setenv("PLOG_LOGDIR", kLogDirName, true);
-    char* argv[] = {cc("test")};
-    paddle::initializeLogging(1, argv);
-
-    P_LOG(INFO) << "INFO";
-    P_LOG(WARNING) << "WARNING";
-    P_LOG(ERROR) << "ERROR";
-    P_LOG(FATAL) << "FATAL";
-  };
-  ASSERT_DEATH(logToDirCallback(), "");
-
-  // There 4 file in logdir
-  auto dir = opendir(kLogDirName);
-  size_t fileCount = 0;
-  std::vector<std::string> filenames;
-  for (auto dirContent = readdir(dir); dirContent != nullptr;
-       dirContent = readdir(dir)) {
-    std::string filename(dirContent->d_name);
-    if (filename == "." || filename == "..") {
-      continue;
-    } else {
-      ++fileCount;
-      for (size_t i = 0; i < kLevels.size(); ++i) {
-        const std::string& curLevel = kLevels[i];
-        if (filename.size() > curLevel.length()) {
-          size_t diff = filename.size() - curLevel.length();
-          size_t j = 0;
-          for (; j < curLevel.length(); ++j) {
-            if (filename[j + diff] != curLevel[j]) {
-              // File Suffix Not Same, then break.
-              break;
-            }
-          }
-          if (j == curLevel.length()) {  // Same suffix.
-            std::ifstream fin;
-            auto fn = paddle::path::join(kLogDirName, filename);
-            fin.open(fn);
-            filenames.push_back(fn);
-            ASSERT_TRUE(fin.is_open());
-            size_t lineCounter = 0;
-            for (std::string line; std::getline(fin, line); ++lineCounter) {
-              // Do Nothing, Just calc lineCounter.
-            }
-
-            // For example.
-            // The info channel will have all log which level >= INFO
-            // So the info file's lineCounter should == 4.
-            ASSERT_EQ(kLevels.size() - i, lineCounter);
-            fin.close();
-          }
-        }
-      }
-    }
-  }
-  closedir(dir);
-  ASSERT_EQ(4UL, fileCount);  // 4 levels.
-  // Clean Unittest.
-  for (std::string& fn : filenames) {
-    ASSERT_EQ(remove(fn.c_str()), 0);
-  }
-  ASSERT_EQ(rmdir(kLogDirName), 0);
-}
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
-
-#else
-
-int main(int, char**) { return 0; }
-
-#endif
diff --git a/paddle/utils/tests/test_SpinLock.cpp b/paddle/utils/tests/test_SpinLock.cpp
index 22f8584ef5..8351e7e3ac 100644
--- a/paddle/utils/tests/test_SpinLock.cpp
+++ b/paddle/utils/tests/test_SpinLock.cpp
@@ -19,7 +19,7 @@ limitations under the License. */
 #include "paddle/utils/Logging.h"
 #include "paddle/utils/Util.h"
 
-P_DEFINE_int32(test_thread_num, 100, "testing thread number");
+DEFINE_int32(test_thread_num, 100, "testing thread number");
 
 void testNormalImpl(
     size_t thread_num,
diff --git a/paddle/utils/tests/test_ThreadBarrier.cpp b/paddle/utils/tests/test_ThreadBarrier.cpp
index 4a8af5b97e..60c2214ffd 100644
--- a/paddle/utils/tests/test_ThreadBarrier.cpp
+++ b/paddle/utils/tests/test_ThreadBarrier.cpp
@@ -20,7 +20,7 @@ limitations under the License. */
 #include "paddle/utils/Logging.h"
 #include "paddle/utils/Util.h"
 
-P_DEFINE_int32(test_thread_num, 100, "testing thread number");
+DEFINE_int32(test_thread_num, 100, "testing thread number");
 
 void testNormalImpl(
     size_t thread_num,
diff --git a/proto/ModelConfig.proto b/proto/ModelConfig.proto
index b34e1ebded..552af71e76 100644
--- a/proto/ModelConfig.proto
+++ b/proto/ModelConfig.proto
@@ -245,7 +245,7 @@ message ImageConfig {
 
   // The size of input feature map.
   required uint32 img_size = 8;
-  required uint32 img_size_y = 9;
+  optional uint32 img_size_y = 9;
 }
 
 message LayerInputConfig {
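Relaxing img_size_y from required to optional means proto2 readers should test for field presence rather than assume it was serialized. A sketch against the generated API, where the accessor names follow the usual proto2 conventions, the paddle namespace is assumed from the proto package, and the square-image fallback is illustrative:

```cpp
// Hypothetical reader: fall back to a square feature map when the Y
// dimension is omitted by an older config.
#include "ModelConfig.pb.h"

uint32_t imageSizeY(const paddle::ImageConfig& conf) {
  return conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
}
```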
diff --git a/third_party/gflags.BUILD b/third_party/gflags.BUILD
new file mode 100644
index 0000000000..85e8bd0bd7
--- /dev/null
+++ b/third_party/gflags.BUILD
@@ -0,0 +1,12 @@
+# Bazel (http://bazel.io/) BUILD file for gflags.
+#
+# See INSTALL.md for instructions for adding gflags to a Bazel workspace.
+
+licenses(["notice"])
+
+exports_files(["src/gflags_completions.sh", "COPYING.txt"])
+
+load(":bazel/gflags.bzl", "gflags_sources", "gflags_library")
+(hdrs, srcs) = gflags_sources(namespace=["google", "gflags"])
+gflags_library(hdrs=hdrs, srcs=srcs, threads=0)
+gflags_library(hdrs=hdrs, srcs=srcs, threads=1)
diff --git a/third_party/gflags_test/BUILD b/third_party/gflags_test/BUILD
new file mode 100644
index 0000000000..b50615203b
--- /dev/null
+++ b/third_party/gflags_test/BUILD
@@ -0,0 +1,10 @@
+licenses(["notice"])  # Apache 2.0
+
+cc_test(
+    name="gflags_test",
+    srcs=["gflags_test.cc"],
+    copts=["-Iexternal/gtest/include"],
+    deps=[
+        "@gtest//:gtest",
+        "@gflags//:gflags",
+    ], )
diff --git a/third_party/gflags_test/gflags_test.cc b/third_party/gflags_test/gflags_test.cc
new file mode 100644
index 0000000000..53286e7e5b
--- /dev/null
+++ b/third_party/gflags_test/gflags_test.cc
@@ -0,0 +1,33 @@
+#include
+#include
+
+#include "gflags/gflags.h"
+#include "gtest/gtest.h"
+
+DEFINE_bool(verbose, false, "Display program name before message");
+DEFINE_string(message, "Hello world!", "Message to print");
+
+static bool IsNonEmptyMessage(const char *flagname, const std::string &value) {
+  return value[0] != '\0';
+}
+DEFINE_validator(message, &IsNonEmptyMessage);
+
+namespace third_party {
+namespace gflags_test {
+
+TEST(GflagsTest, ParseAndPrint) {
+  gflags::SetUsageMessage("some usage message");
+  gflags::SetVersionString("1.0.0");
+  int argc = 1;
+  char program_name[] = "gflags_test";
+  char **argv = new char *[2];
+  argv[0] = program_name;
+  argv[1] = NULL;
+  gflags::ParseCommandLineFlags(&argc, reinterpret_cast<char***>(&argv), true);
+  EXPECT_EQ("gflags_test", std::string(gflags::ProgramInvocationShortName()));
+  EXPECT_EQ("Hello world!", FLAGS_message);
+  gflags::ShutDownCommandLineFlags();
+}
+
+}  // namespace gflags_test
+}  // namespace third_party
diff --git a/third_party/glog.BUILD b/third_party/glog.BUILD
new file mode 100644
index 0000000000..a0ff1d6b41
--- /dev/null
+++ b/third_party/glog.BUILD
@@ -0,0 +1,128 @@
+licenses(["notice"])
+
+cc_library(
+    visibility=["//visibility:public"],
+    name="glog",
+    includes=[
+        ".",
+        "src",
+    ],
+    copts=[
+        "-D_START_GOOGLE_NAMESPACE_='namespace google {'",
+        "-D_END_GOOGLE_NAMESPACE_='}'",
+        "-DGOOGLE_NAMESPACE='google'",
+        "-DGOOGLE_GLOG_DLL_DECL=''",
+        "-DHAVE_DLADDR",
+        "-DHAVE_SNPRINTF",
+        "-DHAVE_DLFCN_H",
+        "-DHAVE_FCNTL",
+        "-DHAVE_GLOB_H",
+        "-DHAVE_INTTYPES_H",
+        "-DHAVE_LIBPTHREAD",
+        "-DHAVE_SYS_SYSCALL_H",
+        "-DHAVE_MEMORY_H",
+        "-DHAVE_NAMESPACES",
+        "-DHAVE_PREAD",
+        "-DHAVE_PTHREAD",
+        "-DHAVE_PWD_H",
+        "-DHAVE_PWRITE",
+        "-DHAVE_RWLOCK",
+        "-DHAVE_SIGACTION",
+        "-DHAVE_SIGALTSTACK",
+        "-DHAVE_STDINT_H",
+        "-DHAVE_STRING_H",
+        "-DHAVE_SYS_TIME_H",
+        "-DHAVE_SYS_TYPES_H",
+        "-DHAVE_SYS_UCONTEXT_H",
+        "-DHAVE_SYS_UTSNAME_H",
+        "-DHAVE_UNISTD_H",
+        "-DHAVE_USING_OPERATOR",
+        "-DHAVE_HAVE___ATTRIBUTE___",
+        "-DHAVE_HAVE___BUILTIN_EXPECT",
+        #"-DNO_FRAME_POINTER",
+        "-D_GNU_SOURCE",
+        #"-fno-sanitize=thread",
+        #"-fno-sanitize=address",
+        "-Iexternal/glog/src",
+    ],
+    srcs=[
+        "src/demangle.cc",
+        "src/logging.cc",
+        "src/raw_logging.cc",
+        "src/signalhandler.cc",
+        "src/symbolize.cc",
+        "src/utilities.cc",
+        "src/vlog_is_on.cc",
+        ":config_h",
+        ":logging_h",
+        ":raw_logging_h",
+        ":stl_logging_h",
+        ":vlog_is_on_h",
+    ],
+    hdrs=[
+        "src/demangle.h",
+        "src/mock-log.h",
+        "src/stacktrace.h",
+        "src/symbolize.h",
+        "src/utilities.h",
+        "src/base/commandlineflags.h",
+        "src/base/googleinit.h",
+        "src/base/mutex.h",
"src/glog/log_severity.h", + ]) + +genrule( + name="config_h", + srcs=["src/config.h.cmake.in"], + outs=["config.h"], + cmd="awk '{ gsub(/^#cmakedefine/, \"//cmakedefine\"); print; }' $(<) > $(@)", +) + +genrule( + name="logging_h", + srcs=["src/glog/logging.h.in"], + outs=["glog/logging.h"], + cmd="$(location :gen_sh) < $(<) > $(@)", + tools=[":gen_sh"]) + +genrule( + name="raw_logging_h", + srcs=["src/glog/raw_logging.h.in"], + outs=["glog/raw_logging.h"], + cmd="$(location :gen_sh) < $(<) > $(@)", + tools=[":gen_sh"]) + +genrule( + name="stl_logging_h", + srcs=["src/glog/stl_logging.h.in"], + outs=["glog/stl_logging.h"], + cmd="$(location :gen_sh) < $(<) > $(@)", + tools=[":gen_sh"]) + +genrule( + name="vlog_is_on_h", + srcs=["src/glog/vlog_is_on.h.in"], + outs=["glog/vlog_is_on.h"], + cmd="$(location :gen_sh) < $(<) > $(@)", + tools=[":gen_sh"]) + +genrule( + name="gen_sh", + outs=["gen.sh"], + cmd=""" +cat > $@ <<"EOF" +#! /bin/sh +sed -e 's/@ac_cv_have_unistd_h@/1/g' \ + -e 's/@ac_cv_have_stdint_h@/1/g' \ + -e 's/@ac_cv_have_systypes_h@/1/g' \ + -e 's/@ac_cv_have_libgflags_h@/1/g' \ + -e 's/@ac_cv_have_uint16_t@/1/g' \ + -e 's/@ac_cv_have___builtin_expect@/1/g' \ + -e 's/@ac_cv_have_.*@/0/g' \ + -e 's/@ac_google_start_namespace@/namespace google {/g' \ + -e 's/@ac_google_end_namespace@/}/g' \ + -e 's/@ac_google_namespace@/google/g' \ + -e 's/@ac_cv___attribute___noinline@/__attribute__((noinline))/g' \ + -e 's/@ac_cv___attribute___noreturn@/__attribute__((noreturn))/g' \ + -e 's/@ac_cv___attribute___printf_4_5@/__attribute__((__format__ (__printf__, 4, 5)))/g' +EOF""") diff --git a/third_party/glog_test/BUILD b/third_party/glog_test/BUILD new file mode 100644 index 0000000000..56d08e95f8 --- /dev/null +++ b/third_party/glog_test/BUILD @@ -0,0 +1,10 @@ +licenses(["notice"]) # Apache 2.0 + +cc_test( + name="glog_test", + srcs=["glog_test.cc"], + copts=["-Iexternal/gtest/include"], + deps=[ + "@gtest//:gtest", + "@glog//:glog", + ], ) diff --git a/third_party/glog_test/glog_test.cc b/third_party/glog_test/glog_test.cc new file mode 100644 index 0000000000..f1d737d625 --- /dev/null +++ b/third_party/glog_test/glog_test.cc @@ -0,0 +1,7 @@ +#include +#include + +#include "glog/logging.h" +#include "gtest/gtest.h" + +TEST(GlogTest, Logging) { LOG(INFO) << "Hello world"; } diff --git a/third_party/gtest.BUILD b/third_party/gtest.BUILD index 71c74af513..9255b51d9a 100644 --- a/third_party/gtest.BUILD +++ b/third_party/gtest.BUILD @@ -1,5 +1,5 @@ cc_library( - name="main", + name="gtest", srcs=glob( ["src/*.cc"], exclude=["src/gtest-all.cc"]), hdrs=glob(["include/**/*.h", "src/*.h"]), diff --git a/third_party/protobuf_test/BUILD b/third_party/protobuf_test/BUILD index 95a687a356..67d4293c70 100644 --- a/third_party/protobuf_test/BUILD +++ b/third_party/protobuf_test/BUILD @@ -19,6 +19,6 @@ cc_test( srcs=["example_lib_test.cc"], copts=["-Iexternal/gtest/include"], deps=[ - "@gtest//:main", + "@gtest//:gtest", ":example_lib", ], )