Fix conflict with develop

9 years ago · b090ce329a
parent 572d8254ea 5e9b19f46c
commit b090ce329a
87 changed files with 3315 additions and 1275 deletions
--- a/cmake/util.cmake
+++ b/cmake/util.cmake
@ -95,7 +95,6 @@ function(link_paddle_exe TARGET_NAME)
        paddle_parameter
        paddle_proto
        paddle_cuda
-        paddle_test_main
        ${METRIC_LIBS}
        ${EXTERNAL_LIBS}
        ${CMAKE_THREAD_LIBS_INIT}
@ -130,8 +129,9 @@ endfunction()
 # Rest Arguemnts: not used.
 function(link_paddle_test TARGET_NAME)
    link_paddle_exe(${TARGET_NAME})
-    target_link_libraries(${TARGET_NAME} ${GTEST_MAIN_LIBRARIES}
-        ${GTEST_LIBRARIES})
+    target_link_libraries(${TARGET_NAME}
+                          paddle_test_main
+                          ${GTEST_LIBRARIES})
 endfunction()

 # add_unittest_without_exec
--- a/demo/model_zoo/embedding/pre_DictAndModel.sh
+++ b/demo/model_zoo/embedding/pre_DictAndModel.sh
@ -14,9 +14,19 @@
 # limitations under the License.
 set -e
 set -x
+BASE_URL='http://paddlepaddle.cdn.bcebos.com/model_zoo/embedding'

-# download the dictionary and pretrained model 
-for file in baidu.dict model_32.emb model_64.emb model_128.emb model_256.emb
-do 
-  wget http://paddlepaddle.bj.bcebos.com/model_zoo/embedding/$file
+DOWNLOAD_ITEMS=(baidu.dict model_32.emb model_64.emb model_128.emb model_256.emb)
+ITEM_MD5=(fa03a12321eaab6c30a8fcc9442eaea3
+          f88c8325ee6da6187f1080e8fe66c1cd
+          927cf70f27f860aff1a5703ebf7f1584
+	  a52e43655cd25d279777ed509a1ae27b
+	  b92c67fe9ff70fea53596080e351ac80)
+
+for ((i=0; i<${#ITEM_MD5[@]}; i++))
+do
+  FILENAME=${DOWNLOAD_ITEMS[${i}]}
+  REAL_MD5=`wget ${BASE_URL}/${FILENAME} -O - | tee ${FILENAME} | md5sum | cut -d ' ' -f 1`
+  EXPECTED_MD5=${ITEM_MD5[${i}]}
+  [ "${EXPECTED_MD5}" = "${REAL_MD5}" ]
 done
--- a/demo/recommendation/evaluate.py
+++ b/demo/recommendation/evaluate.py
@ -0,0 +1,37 @@
+#!/usr/bin/python
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import sys
+import re
+import math
+
+
+def get_best_pass(log_filename):
+    with open(log_filename, 'r') as f:
+        text = f.read()
+        pattern = re.compile('Test.*? cost=([0-9]+\.[0-9]+).*?pass-([0-9]+)',
+                             re.S)
+        results = re.findall(pattern, text)
+        sorted_results = sorted(results, key=lambda result: float(result[0]))
+        return sorted_results[0]
+
+
+log_filename = sys.argv[1]
+log = get_best_pass(log_filename)
+predict_error = math.sqrt(float(log[0])) / 2
+print 'Best pass is %s, error is %s, which means predict get error as %f' % (
+    log[1], log[0], predict_error)
+
+evaluate_pass = "output/pass-%s" % log[1]
+print "evaluating from pass %s" % evaluate_pass
--- a/doc/faq/index_cn.rst
+++ b/doc/faq/index_cn.rst
@ -72,7 +72,7 @@ PaddlePaddle支持非常多的优化算法(Optimizer)，不同的优化算法需
 减少数据载入的耗时
 ++++++++++++++++++

-使用 :code:`pydataprovider`时，可以减少缓存池的大小，同时设置内存缓存功能，即可以极大的加速数据载入流程。
+使用\ :code:`pydataprovider`\ 时，可以减少缓存池的大小，同时设置内存缓存功能，即可以极大的加速数据载入流程。
 :code:`DataProvider` 缓存池的减小，和之前减小通过减小缓存池来减小内存占用的原理一致。

 ..  literalinclude:: src/reduce_min_pool_size.py
--- a/doc/howto/deep_model/rnn/index_cn.rst
+++ b/doc/howto/deep_model/rnn/index_cn.rst
@ -4,6 +4,7 @@ RNN相关模型
 ..  toctree::
  :maxdepth: 1

+  rnn_config_cn.rst
  recurrent_group_cn.md
  hierarchical_layer_cn.rst
  hrnn_rnn_api_compare_cn.rst
--- a/doc/howto/deep_model/rnn/rnn_cn.md
+++ b/doc/howto/deep_model/rnn/rnn_cn.md
@ -1,226 +0,0 @@
-RNN 配置
-=================
-
-本教程将指导你如何在 PaddlePaddle 中配置循环神经网络（RNN）。PaddlePaddle 高度支持灵活和高效的循环神经网络配置。 在本教程中，您将了解如何：
-
-   准备用来学习循环神经网络的序列数据。
-   配置循环神经网络架构。
-   使用学习完成的循环神经网络模型生成序列。
-
-我们将使用 vanilla 循环神经网络和 sequence to sequence 模型来指导你完成这些步骤。sequence to sequence 模型的代码可以在`demo / seqToseq`找到。
-
-准备序列数据
---------------------
-
-PaddlePaddle 不需要对序列数据进行任何预处理，例如填充。唯一需要做的是将相应类型设置为输入。例如，以下代码段定义了三个输入。 它们都是序列，它们的大小是`src_dict`，`trg_dict`和`trg_dict`：
-
-``` sourceCode
-settings.input_types = [
-  integer_value_sequence(len(settings.src_dict)),
-  integer_value_sequence(len(settings.trg_dict)),
-  integer_value_sequence(len(settings.trg_dict))]
-```
-
-在`process`函数中，每个`yield`函数将返回三个整数列表。每个整数列表被视为一个整数序列：
-
-``` sourceCode
-yield src_ids, trg_ids, trg_ids_next
-```
-
-有关如何编写数据提供程序的更多细节描述，请参考 [PyDataProvider2](../../ui/data_provider/index.html)。完整的数据提供文件在 `demo/seqToseq/dataprovider.py`。
-
-配置循环神经网络架构
-----------------------------------------------
-
-### 简单门控循环神经网络(Gated Recurrent Neural Network)
-
-循环神经网络在每个时间步骤顺序地处理序列。下面列出了 LSTM 的架构的示例。
-
-![image](../../../tutorials/sentiment_analysis/bi_lstm.jpg)
-
-一般来说，循环网络从 *t* = 1 到 *t* = *T* 或者反向地从 *t* = *T* 到 *t* = 1 执行以下操作。
-
-*x*<sub>*t* + 1</sub> = *f*<sub>*x*</sub>(*x*<sub>*t*</sub>),*y*<sub>*t*</sub> = *f*<sub>*y*</sub>(*x*<sub>*t*</sub>)
-
-其中 *f*<sub>*x*</sub>(.) 称为**单步函数**（即单时间步执行的函数，step function），而 *f*<sub>*y*</sub>(.) 称为**输出函数**。在 vanilla 循环神经网络中，单步函数和输出函数都非常简单。然而，PaddlePaddle 可以通过修改这两个函数来实现复杂的网络配置。我们将使用 sequence to sequence 模型演示如何配置复杂的循环神经网络模型。在本节中，我们将使用简单的 vanilla 循环神经网络作为使用`recurrent_group`配置简单循环神经网络的例子。 注意，如果你只需要使用简单的RNN，GRU或LSTM，那么推荐使用`grumemory`和`lstmemory`，因为它们的计算效率比`recurrent_group`更高。
-
-对于 vanilla RNN，在每个时间步长，**单步函数**为：
-
-*x*<sub>*t* + 1</sub> = *W*<sub>*x*</sub>*x*<sub>*t*</sub> + *W*<sub>*i*</sub>*I*<sub>*t*</sub> + *b*
-
-其中 *x*<sub>*t*</sub> 是RNN状态，并且 *I*<sub>*t*</sub> 是输入，*W*<sub>*x*</sub> 和 *W*<sub>*i*</sub> 分别是RNN状态和输入的变换矩阵。*b* 是偏差。它的**输出函数**只需要*x*<sub>*t*</sub>作为输出。
-
-`recurrent_group`是构建循环神经网络的最重要的工具。 它定义了**单步函数**，**输出函数**和循环神经网络的输入。注意，这个函数的`step`参数需要实现`step function`（单步函数）和`output function`（输出函数）：
-
-
-``` sourceCode
-def simple_rnn(input,
-               size=None,
-               name=None,
-               reverse=False,
-               rnn_bias_attr=None,
-               act=None,
-               rnn_layer_attr=None):
-    def __rnn_step__(ipt):
-       out_mem = memory(name=name, size=size)
-       rnn_out = mixed_layer(input = [full_matrix_projection(ipt),
-                                      full_matrix_projection(out_mem)],
-                             name = name,
-                             bias_attr = rnn_bias_attr,
-                             act = act,
-                             layer_attr = rnn_layer_attr,
-                             size = size)
-       return rnn_out
-    return recurrent_group(name='%s_recurrent_group' % name,
-                           step=__rnn_step__,
-                           reverse=reverse,
-                           input=input)
-```
-
-PaddlePaddle 使用“Memory”（记忆模块）实现单步函数。**Memory**是在PaddlePaddle中构造循环神经网络时最重要的概念。 Memory是在单步函数中循环使用的状态，例如*x*<sub>*t* + 1</sub> = *f*<sub>*x*</sub>(*x*<sub>*t*</sub>)。 一个Memory包含**输出**和**输入**。当前时间步处的Memory的输出作为下一时间步Memory的输入。Memory也可以具有**boot layer(引导层)**，其输出被用作Memory的初始值。 在我们的例子中，门控循环单元的输出被用作输出Memory。请注意，`rnn_out`层的名称与`out_mem`的名称相同。这意味着`rnn_out` (*x*<sub>*t* + 1</sub>)的输出被用作`out_mem`Memory的**输出**。
-
-Memory也可以是序列。在这种情况下，在每个时间步中，我们有一个序列作为循环神经网络的状态。这在构造非常复杂的循环神经网络时是有用的。 其他高级功能包括定义多个Memory，以及使用子序列来定义分级循环神经网络架构。
-
-我们在函数的结尾返回`rnn_out`。 这意味着 `rnn_out` 层的输出被用作门控循环神经网络的**输出**函数。
-
-### Sequence to Sequence Model with Attention
-
-我们将使用 sequence to sequence model with attention 作为例子演示如何配置复杂的循环神经网络模型。该模型的说明如下图所示。
-
-![image](../../../tutorials/text_generation/encoder-decoder-attention-model.png)
-
-在这个模型中，源序列 *S* = {*s*<sub>1</sub>, …, *s*<sub>*T*</sub>} 用双向门控循环神经网络编码。双向门控循环神经网络的隐藏状态 *H*<sub>*S*</sub> = {*H*<sub>1</sub>, …, *H*<sub>*T*</sub>} 被称为 *编码向量*。解码器是门控循环神经网络。当解读每一个*y*<sub>*t*</sub>时, 这个门控循环神经网络生成一系列权重 *W*<sub>*S*</sub><sup>*t*</sup> = {*W*<sub>1</sub><sup>*t*</sup>, …, *W*<sub>*T*</sub><sup>*t*</sup>}, 用于计算编码向量的加权和。加权和用来生成*y*<sub>*t*</sub>。
-
-模型的编码器部分如下所示。它叫做`grumemory`来表示门控循环神经网络。如果网络架构简单，那么推荐使用循环神经网络的方法，因为它比 `recurrent_group` 更快。我们已经实现了大多数常用的循环神经网络架构，可以参考 [Layers](../../ui/api/trainer_config_helpers/layers_index.html) 了解更多细节。
-
-我们还将编码向量投射到 `decoder_size` 维空间。这通过获得反向循环网络的第一个实例，并将其投射到 `decoder_size` 维空间完成：
-
-``` sourceCode
-# 定义源语句的数据层
-src_word_id = data_layer(name='source_language_word', size=source_dict_dim)
-# 计算每个词的词向量
-src_embedding = embedding_layer(
-    input=src_word_id,
-    size=word_vector_dim,
-    param_attr=ParamAttr(name='_source_language_embedding'))
-# 应用前向循环神经网络
-src_forward = grumemory(input=src_embedding, size=encoder_size)
-# 应用反向递归神经网络（reverse=True表示反向循环神经网络）
-src_backward = grumemory(input=src_embedding,
-                          size=encoder_size,
-                          reverse=True)
-# 将循环神经网络的前向和反向部分混合在一起
-encoded_vector = concat_layer(input=[src_forward, src_backward])
-
-# 投射编码向量到 decoder_size
-encoder_proj = mixed_layer(input = [full_matrix_projection(encoded_vector)],
-                           size = decoder_size)
-
-# 计算反向RNN的第一个实例
-backward_first = first_seq(input=src_backward)
-
-# 投射反向RNN的第一个实例到 decoder size
-decoder_boot = mixed_layer(input=[full_matrix_projection(backward_first)], size=decoder_size, act=TanhActivation())
-```
-
-解码器使用 `recurrent_group` 来定义循环神经网络。单步函数和输出函数在 `gru_decoder_with_attention` 中定义：
-
-``` sourceCode
-group_inputs=[StaticInput(input=encoded_vector,is_seq=True),
-              StaticInput(input=encoded_proj,is_seq=True)]
-trg_embedding = embedding_layer(
-    input=data_layer(name='target_language_word',
-                     size=target_dict_dim),
-    size=word_vector_dim,
-    param_attr=ParamAttr(name='_target_language_embedding'))
-group_inputs.append(trg_embedding)
-
-# 对于配备有注意力机制的解码器，在训练中，
-# 目标向量（groudtruth）是数据输入，
-# 而源序列的编码向量可以被无边界的memory访问
-# StaticInput 意味着不同时间步的输入都是相同的值，
-# 否则它以一个序列输入，不同时间步的输入是不同的。
-# 所有输入序列应该有相同的长度。
-decoder = recurrent_group(name=decoder_group_name,
-                          step=gru_decoder_with_attention,
-                          input=group_inputs)
-```
-
-单步函数的实现如下所示。首先，它定义解码网络的**Memory**。然后定义 attention，门控循环单元单步函数和输出函数：
-
-``` sourceCode
-def gru_decoder_with_attention(enc_vec, enc_proj, current_word):
-    # 定义解码器的Memory
-    # Memory的输出定义在 gru_step 内
-    # 注意 gru_step 应该与它的Memory名字相同
-    decoder_mem = memory(name='gru_decoder',
-                         size=decoder_size,
-                         boot_layer=decoder_boot)
-    # 计算 attention 加权编码向量
-    context = simple_attention(encoded_sequence=enc_vec,
-                               encoded_proj=enc_proj,
-                               decoder_state=decoder_mem)
-    # 混合当前词向量和attention加权编码向量
-    decoder_inputs = mixed_layer(inputs = [full_matrix_projection(context),
-                                           full_matrix_projection(current_word)],
-                                 size = decoder_size * 3)
-    # 定义门控循环单元循环神经网络单步函数
-    gru_step = gru_step_layer(name='gru_decoder',
-                              input=decoder_inputs,
-                              output_mem=decoder_mem,
-                              size=decoder_size)
-    # 定义输出函数
-    out = mixed_layer(input=[full_matrix_projection(input=gru_step)],
-                      size=target_dict_dim,
-                      bias_attr=True,
-                      act=SoftmaxActivation())
-    return out
-```
-
-生成序列
-----------------
-
-训练模型后，我们可以使用它来生成序列。通常的做法是使用**beam search** 生成序列。以下代码片段定义 beam search 算法。注意，`beam_search` 函数假设 `step` 的输出函数返回的是下一个时刻输出词的 softmax 归一化概率向量。我们对模型进行了以下更改。
-
-   使用 `GeneratedInput` 来表示 trg\_embedding。 `GeneratedInput` 将上一时间步所生成的词的向量来作为当前时间步的输入。
-   使用 `beam_search` 函数。这个函数需要设置：
-    -   `bos_id`: 开始标记。每个句子都以开始标记开头。
-    -   `eos_id`: 结束标记。每个句子都以结束标记结尾。
-    -   `beam_size`: beam search 算法中的beam大小。
-    -   `max_length`: 生成序列的最大长度。
-   使用 `seqtext_printer_evaluator` 根据索引矩阵和字典打印文本。这个函数需要设置：
-    -   `id_input`: 数据的整数ID，用于标识生成的文件中的相应输出。
-    -   `dict_file`: 用于将词ID转换为词的字典文件。
-    -   `result_file`: 生成结果文件的路径。
-
-代码如下：
-
-``` sourceCode
-group_inputs=[StaticInput(input=encoded_vector,is_seq=True),
-              StaticInput(input=encoded_proj,is_seq=True)]
-# 在生成时，解码器基于编码源序列和最后生成的目标词预测下一目标词。
-# 编码源序列（编码器输出）必须由只读Memory的 StaticInput 指定。
-# 这里， GeneratedInputs 自动获取上一个生成的词，并在最开始初始化为起始词，如 <s>。
-trg_embedding = GeneratedInput(
-    size=target_dict_dim,
-    embedding_name='_target_language_embedding',
-    embedding_size=word_vector_dim)
-group_inputs.append(trg_embedding)
-beam_gen = beam_search(name=decoder_group_name,
-                       step=gru_decoder_with_attention,
-                       input=group_inputs,
-                       bos_id=0, # Beginnning token.
-                       eos_id=1, # End of sentence token.
-                       beam_size=beam_size,
-                       max_length=max_length)
-
-seqtext_printer_evaluator(input=beam_gen,
-                          id_input=data_layer(name="sent_id", size=1),
-                          dict_file=trg_dict_path,
-                          result_file=gen_trans_file)
-outputs(beam_gen)
-```
-
-注意，这种生成技术只用于类似解码器的生成过程。如果你正在处理序列标记任务，请参阅 [Semantic Role Labeling Demo](../../demo/semantic_role_labeling/index.html) 了解更多详细信息。
-
-完整的配置文件在`demo/seqToseq/seqToseq_net.py`。
--- a/doc/howto/deep_model/rnn/rnn_config_cn.rst
+++ b/doc/howto/deep_model/rnn/rnn_config_cn.rst
@ -1,4 +1,4 @@
-RNN 配置
+RNN配置
 ========

 本教程将指导你如何在 PaddlePaddle
@ -20,7 +20,7 @@ PaddlePaddle
 不需要对序列数据进行任何预处理，例如填充。唯一需要做的是将相应类型设置为输入。例如，以下代码段定义了三个输入。
 它们都是序列，它们的大小是\ ``src_dict``\ ，\ ``trg_dict``\ 和\ ``trg_dict``\ ：

-.. code:: sourcecode
+.. code:: python

    settings.input_types = [
      integer_value_sequence(len(settings.src_dict)),
@ -29,12 +29,11 @@ PaddlePaddle

 在\ ``process``\ 函数中，每个\ ``yield``\ 函数将返回三个整数列表。每个整数列表被视为一个整数序列：

-.. code:: sourcecode
+.. code:: python

    yield src_ids, trg_ids, trg_ids_next

-有关如何编写数据提供程序的更多细节描述，请参考
-`PyDataProvider2 <../../ui/data_provider/index.html>`__\ 。完整的数据提供文件在
+有关如何编写数据提供程序的更多细节描述，请参考 :ref:`api_pydataprovider2` 。完整的数据提供文件在
 ``demo/seqToseq/dataprovider.py``\ 。

 配置循环神经网络架构
@ -45,18 +44,17 @@ PaddlePaddle

 循环神经网络在每个时间步骤顺序地处理序列。下面列出了 LSTM 的架构的示例。

-.. figure:: ../../../tutorials/sentiment_analysis/bi_lstm.jpg
-   :alt: image
+.. image:: ../../../tutorials/sentiment_analysis/bi_lstm.jpg
+      :align: center

-   image
+一般来说，循环网络从 :math:`t=1` 到 :math:`t=T` 或者反向地从 :math:`t=T` 到 :math:`t=1` 执行以下操作。

-一般来说，循环网络从 *t* = 1 到 *t* = *T* 或者反向地从 *t* = *T* 到 *t*
-= 1 执行以下操作。
+.. math::

-*x*\ \ *t* + 1 = *f*\ \ *x*\ (*x*\ \ *t*\ ),\ *y*\ \ *t*\  = *f*\ \ *y*\ (*x*\ \ *t*\ )
+    x_{t+1} = f_x(x_t), y_t = f_y(x_t)

-其中 *f*\ \ *x*\ (.) 称为\ **单步函数**\ （即单时间步执行的函数，step
-function），而 *f*\ \ *y*\ (.) 称为\ **输出函数**\ 。在 vanilla
+其中 :math:`f_x(.)` 称为\ **单步函数**\ （即单时间步执行的函数，step
+function），而 :math:`f_y(.)` 称为\ **输出函数**\ 。在 vanilla
 循环神经网络中，单步函数和输出函数都非常简单。然而，PaddlePaddle
 可以通过修改这两个函数来实现复杂的网络配置。我们将使用 sequence to
 sequence
@ -67,16 +65,17 @@ vanilla

 对于 vanilla RNN，在每个时间步长，\ **单步函数**\ 为：

-*x*\ \ *t* + 1 = *W*\ \ *x*\ \ *x*\ \ *t*\  + *W*\ \ *i*\ \ *I*\ \ *t*\  + *b*
+.. math::

-其中 *x*\ \ *t*\  是RNN状态，并且 *I*\ \ *t*\  是输入，\ *W*\ \ *x*\  和
-*W*\ \ *i*\  分别是RNN状态和输入的变换矩阵。\ *b*
-是偏差。它的\ **输出函数**\ 只需要\ *x*\ \ *t*\ 作为输出。
+    x_{t+1} = W_x x_t + W_i I_t + b
+
+其中 :math:`x_t` 是RNN状态，并且 :math:`I_t` 是输入，:math:`W_x` 和
+:math:`W_i` 分别是RNN状态和输入的变换矩阵。:math:`b` 是偏差。它的\ **输出函数**\ 只需要 :math:`x_t` 作为输出。

 ``recurrent_group``\ 是构建循环神经网络的最重要的工具。
 它定义了\ **单步函数**\ ，\ **输出函数**\ 和循环神经网络的输入。注意，这个函数的\ ``step``\ 参数需要实现\ ``step function``\ （单步函数）和\ ``output function``\ （输出函数）：

-.. code:: sourcecode
+.. code:: python

    def simple_rnn(input,
                   size=None,
@ -102,7 +101,7 @@ vanilla

 PaddlePaddle
 使用“Memory”（记忆模块）实现单步函数。\ **Memory**\ 是在PaddlePaddle中构造循环神经网络时最重要的概念。
-Memory是在单步函数中循环使用的状态，例如\ *x*\ \ *t* + 1 = *f*\ \ *x*\ (*x*\ \ *t*\ )。
+Memory是在单步函数中循环使用的状态，例如 :math:`x_{t+1} = f_x(x_t)` 。
 一个Memory包含\ **输出**\ 和\ **输入**\ 。当前时间步处的Memory的输出作为下一时间步Memory的输入。Memory也可以具有\ **boot
 layer(引导层)**\ ，其输出被用作Memory的初始值。
 在我们的例子中，门控循环单元的输出被用作输出Memory。请注意，\ ``rnn_out``\ 层的名称与\ ``out_mem``\ 的名称相同。这意味着\ ``rnn_out``
@ -120,30 +119,25 @@ Sequence to Sequence Model with Attention
 我们将使用 sequence to sequence model with attention
 作为例子演示如何配置复杂的循环神经网络模型。该模型的说明如下图所示。

-.. figure:: ../../../tutorials/text_generation/encoder-decoder-attention-model.png
-   :alt: image
-
-   image
+.. image:: ../../../tutorials/text_generation/encoder-decoder-attention-model.png
+      :align: center

-在这个模型中，源序列 *S* = {*s*\ 1, …, \ *s*\ \ *T*\ }
+在这个模型中，源序列 :math:`S = \{s_1, \dots, s_T\}` 
 用双向门控循环神经网络编码。双向门控循环神经网络的隐藏状态
-*H*\ \ *S*\  = {*H*\ 1, …, \ *H*\ \ *T*\ } 被称为
-*编码向量*\ 。解码器是门控循环神经网络。当解读每一个\ *y*\ \ *t*\ 时,
-这个门控循环神经网络生成一系列权重
-*W*\ \ *S*\ \ *t*\  = {*W*\ 1\ *t*\ , …, \ *W*\ \ *T*\ \ *t*\ },
-用于计算编码向量的加权和。加权和用来生成\ *y*\ \ *t*\ 。
+:math:`H_S = \{H_1, \dots, H_T\}` 被称为
+*编码向量*\ 。解码器是门控循环神经网络。当解读每一个 :math:`y_t` 时,
+这个门控循环神经网络生成一系列权重  :math:`W_S^t = \{W_1^t, \dots, W_T^t\}` ,
+用于计算编码向量的加权和。加权和用来生成 :math:`y_t` 。

 模型的编码器部分如下所示。它叫做\ ``grumemory``\ 来表示门控循环神经网络。如果网络架构简单，那么推荐使用循环神经网络的方法，因为它比
 ``recurrent_group``
-更快。我们已经实现了大多数常用的循环神经网络架构，可以参考
-`Layers <../../ui/api/trainer_config_helpers/layers_index.html>`__
-了解更多细节。
+更快。我们已经实现了大多数常用的循环神经网络架构，可以参考 :ref:`api_trainer_config_helpers_layers` 了解更多细节。

 我们还将编码向量投射到 ``decoder_size``
 维空间。这通过获得反向循环网络的第一个实例，并将其投射到
 ``decoder_size`` 维空间完成：

-.. code:: sourcecode
+.. code:: python

    # 定义源语句的数据层
    src_word_id = data_layer(name='source_language_word', size=source_dict_dim)
@ -174,7 +168,7 @@ Sequence to Sequence Model with Attention
 解码器使用 ``recurrent_group`` 来定义循环神经网络。单步函数和输出函数在
 ``gru_decoder_with_attention`` 中定义：

-.. code:: sourcecode
+.. code:: python

    group_inputs=[StaticInput(input=encoded_vector,is_seq=True),
                  StaticInput(input=encoded_proj,is_seq=True)]
@ -198,7 +192,7 @@ Sequence to Sequence Model with Attention
 单步函数的实现如下所示。首先，它定义解码网络的\ **Memory**\ 。然后定义
 attention，门控循环单元单步函数和输出函数：

-.. code:: sourcecode
+.. code:: python

    def gru_decoder_with_attention(enc_vec, enc_proj, current_word):
        # 定义解码器的Memory
@ -253,7 +247,7 @@ attention，门控循环单元单步函数和输出函数：

 代码如下：

-.. code:: sourcecode
+.. code:: python

    group_inputs=[StaticInput(input=encoded_vector,is_seq=True),
                  StaticInput(input=encoded_proj,is_seq=True)]
@ -279,9 +273,6 @@ attention，门控循环单元单步函数和输出函数：
                              result_file=gen_trans_file)
    outputs(beam_gen)

-注意，这种生成技术只用于类似解码器的生成过程。如果你正在处理序列标记任务，请参阅
-`Semantic Role Labeling
-Demo <../../demo/semantic_role_labeling/index.html>`__
-了解更多详细信息。
+注意，这种生成技术只用于类似解码器的生成过程。如果你正在处理序列标记任务，请参阅 :ref:`semantic_role_labeling` 了解更多详细信息。

 完整的配置文件在\ ``demo/seqToseq/seqToseq_net.py``\ 。
--- a/doc/howto/index_cn.rst
+++ b/doc/howto/index_cn.rst
@ -7,10 +7,11 @@
 ..  toctree::
  :maxdepth: 1

+  usage/cmd_parameter/index_cn.rst
  usage/concepts/use_concepts_cn.rst
  usage/cluster/cluster_train_cn.md
-  usage/cluster/k8s/k8s_cn.md
-  usage/cluster/k8s/k8s_distributed_cn.md
+  usage/k8s/k8s_cn.md
+  usage/k8s/k8s_distributed_cn.md

 开发标准
 --------
--- a/doc/howto/index_en.rst
+++ b/doc/howto/index_en.rst
@ -7,8 +7,10 @@ Usage
 ..  toctree::
  :maxdepth: 1

-  usage/cmd_parameter/index_en.md
+  usage/cmd_parameter/index_en.rst
  usage/cluster/cluster_train_en.md
+  usage/k8s/k8s_en.md
+  usage/k8s/k8s_aws_en.md

 Development
 ------------
--- a/doc/howto/usage/cmd_parameter/arguments_cn.md
+++ b/doc/howto/usage/cmd_parameter/arguments_cn.md
--- a/doc/howto/usage/cmd_parameter/detail_introduction_cn.md
+++ b/doc/howto/usage/cmd_parameter/detail_introduction_cn.md
--- a/doc/howto/usage/cmd_parameter/detail_introduction_en.md
+++ b/doc/howto/usage/cmd_parameter/detail_introduction_en.md
@ -73,7 +73,7 @@
  - type: bool (default: 0).

 * `--load_missing_parameter_strategy`
-  - Specify the loading operation when model file is missing. Now support fail/rand/zere three operations.
+  - Specify the loading operation when model file is missing. Now support fail/rand/zero three operations.
    - `fail`: program will exit.
    - `rand`: uniform or normal distribution according to **initial\_strategy** in network config. Uniform range is: **[mean - std, mean + std]**, where mean and std are configures in trainer config.
    - `zero`: all parameters are zero.
@ -118,11 +118,11 @@
  - type: int32 (default: 0).

 * `--test_wait`
-  - Whether to wait for parameter per pass if not exist. If set test_data_path in submitting environment of cluster, it will launch one process to perfom testing, so we need to set test_wait=1. Note that in the cluster submitting environment, this argument has been set True by default.
+  - Whether to wait for parameter per pass if not exist. It can be used when user launch another process to perfom testing during the training process.
  - type: bool (default: 0).

 * `--model_list`
-  - File that saves the model list when testing. It was set automatically when using cluster submitting environment after setting model_path.
+  - File that saves the model list when testing. 
  - type: string (default: "", null).

 * `--predict_output_dir`
@ -212,7 +212,7 @@
  - type: bool (default: 0).

 * `--pservers`
-  - Comma separated IP addresses of pservers. It is set automatically in cluster submitting environment.
+  - Comma separated IP addresses of pservers.
  - type: string (default: "127.0.0.1").

 * `--port`
--- a/doc/howto/usage/cmd_parameter/index_cn.rst
+++ b/doc/howto/usage/cmd_parameter/index_cn.rst
@ -0,0 +1,11 @@
+..  _cmd_line_index:
+
+设置命令行参数
+===============
+
+..  toctree::
+  :maxdepth: 1
+
+  use_case_cn.md
+  arguments_cn.md
+  detail_introduction_cn.md
--- a/doc/howto/usage/cmd_parameter/index_en.md
+++ b/doc/howto/usage/cmd_parameter/index_en.md
@ -1,8 +0,0 @@
-```eval_rst
-..  _cmd_line_index:
-```
-# Set Command-line Parameters
-
-* [Use Case](use_case_en.md)
-* [Arguments](arguments_en.md)
-* [Detailed Descriptions](detail_introduction_en.md)
--- a/doc/howto/usage/cmd_parameter/index_en.rst
+++ b/doc/howto/usage/cmd_parameter/index_en.rst
@ -0,0 +1,11 @@
+..  _cmd_line_index:
+
+Set Command-line Parameters
+===========================
+
+..  toctree::
+  :maxdepth: 1
+
+  use_case_en.md
+  arguments_en.md
+  detail_introduction_en.md
--- a/doc/howto/usage/cmd_parameter/use_case_cn.md
+++ b/doc/howto/usage/cmd_parameter/use_case_cn.md
@ -0,0 +1,182 @@
+# 使用案例
+
+## 本地训练
+
+本地训练的实验，诸如图像分类，自然语言处理等，通常都会使用下面这些命令行参数。
+
+```
+paddle train \
+  --use_gpu=1/0 \                        #1:GPU,0:CPU(默认为1)
+  --config=network_config \
+  --save_dir=output \
+  --trainer_count=COUNT \                #(默认为1)
+  --test_period=M \                      #(默认为0) 
+  --num_passes=N \                       #(默认为100)
+  --log_period=K \                       #(默认为100)
+  --dot_period=1000 \                    #(默认为1)
+  #[--show_parameter_stats_period=100] \ #(默认为0)
+  #[--saving_period_by_batches=200] \    #(默认为0)
+```
+根据你的任务，可以选择是否使用参数`show_parameter_stats_period`和`saving_period_by_batches`。
+
+### 1) 将命令参数传给网络配置
+
+`config_args`是一个很有用的参数，用于将参数传递给网络配置。
+
+```
+--config_args=generating=1,beam_size=5,layer_num=10 \
+```
+`get_config_arg`可用于在网络配置中解析这些参数，如下所示：
+
+```
+generating = get_config_arg('generating', bool, False)
+beam_size = get_config_arg('beam_size', int, 3)
+layer_num = get_config_arg('layer_num', int, 8)
+```
+
+`get_config_arg`:
+
+```
+get_config_arg(name, type, default_value)
+```
+- name: `--config_args`中指定的名字
+- type: 值类型，包括bool, int, str, float等
+- default_value: 默认值
+
+### 2) 使用模型初始化网络
+
+增加如下参数：
+
+```
+--init_model_path=model_path
+--load_missing_parameter_strategy=rand
+```
+
+## 本地测试
+
+方法一：
+
+```
+paddle train --job=test \
+             --use_gpu=1/0 \ 
+             --config=network_config \
+             --trainer_count=COUNT \ 
+             --init_model_path=model_path \
+```
+- 使用init\_model\_path指定测试的模型
+- 只能测试单个模型
+
+方法二：
+
+```
+paddle train --job=test \
+             --use_gpu=1/0 \ 
+             --config=network_config \
+             --trainer_count=COUNT \ 
+             --model_list=model.list \
+```
+- 使用model_list指定测试的模型列表
+- 可以测试多个模型，文件model.list如下所示：
+
+```
+./alexnet_pass1
+./alexnet_pass2
+```
+
+方法三：
+
+```
+paddle train --job=test \
+             --use_gpu=1/0 \
+             --config=network_config \
+             --trainer_count=COUNT \
+             --save_dir=model \
+             --test_pass=M \
+             --num_passes=N \
+```
+这种方式必须使用Paddle存储的模型路径格式，如：`model/pass-%5d`。测试的模型包括从第M轮到第N-1轮存储的所有模型。例如，M=12，N=14这种写法将会测试模型`model/pass-00012`和`model/pass-00013`。
+
+## 稀疏训练
+
+当输入是维度很高的稀疏数据时，通常使用稀疏训练来加速计算过程。例如，输入数据的字典维数是1百万，但是每个样本仅包含几个词。在Paddle中，稀疏矩阵的乘积应用于前向传播过程，而稀疏更新在反向传播之后的权重更新时进行。
+
+### 1) 本地训练
+
+用户需要在网络配置中指定**sparse\_update=True**。请参照网络配置的文档了解更详细的信息。
+
+### 2) 集群训练
+
+在集群上训练一个稀疏模型需要加上下面的参数。同时用户需要在网络配置中指定**sparse\_remote\_update=True**。请参照网络配置的文档了解更详细的信息。
+
+```
+--ports_num_for_sparse=1    #(默认为0)
+```
+
+## parallel_nn
+用户可以设置`parallel_nn`来混合使用GPU和CPU计算网络层的参数。也就是说，你可以将网络配置成某些层使用GPU计算，而其他层使用CPU计算。另一种方式是将网络层划分到不同的GPU上去计算，这样可以减小GPU内存，或者采用并行计算来加速某些层的更新。
+
+如果你想使用这些特性，你需要在网络配置中指定设备的ID号(表示为deviceId)，并且加上下面的命令行参数:
+
+```
+--parallel_nn=true
+```
+### 案例一：GPU和CPU混合使用
+请看下面的例子：
+
+```
+#command line:
+paddle train --use_gpu=true --parallel_nn=true trainer_count=COUNT
+
+default_device(0)
+
+fc1=fc_layer(...)
+fc2=fc_layer(...)
+fc3=fc_layer(...,layer_attr=ExtraAttr(device=-1))
+
+```
+- default_device(0): 设置默认设备号为0。这意味着除了指定device=-1的层之外，其他所有层都会使用GPU计算，每层使用的GPU号依赖于参数trainer\_count和gpu\_id(默认为0)。在此，fc1和fc2层在GPU上计算。
+
+- device=-1: fc3层使用CPU计算。
+
+- trainer_count:
+  - trainer_count=1: 如果未设置gpu\_id，那么fc1和fc2层将会使用第1个GPU来计算。否则使用gpu\_id指定的GPU。
+
+  - trainer_count>1: 在trainer\_count个GPU上使用数据并行来计算某一层。例如，trainer\_count=2意味着0号和1号GPU将会使用数据并行来计算fc1和fc2层。
+
+### 案例二：在不同设备上指定层
+
+```
+#command line:
+paddle train --use_gpu=true --parallel_nn=true --trainer_count=COUNT
+
+#network:
+fc2=fc_layer(input=l1, layer_attr=ExtraAttr(device=0), ...)
+fc3=fc_layer(input=l1, layer_attr=ExtraAttr(device=1), ...)
+fc4=fc_layer(input=fc2, layer_attr=ExtraAttr(device=-1), ...)
+```
+在本例中，我们假设一台机器上有4个GPU。
+
+- trainer_count=1:
+  - 使用0号GPU计算fc2层。
+  - 使用1号GPU计算fc3层。
+  - 使用CPU计算fc4层。
+
+- trainer_count=2:
+  - 使用0号和1号GPU计算fc2层。
+  - 使用2号和3号GPU计算fc3层。
+  - 使用CPU两线程计算fc4层。
+
+- trainer_count=4:
+  - 运行失败（注意到我们已经假设机器上有4个GPU），因为参数`allow_only_one_model_on_one_gpu`默认设置为真。
+
+**当`device!=-1`时设备ID号的分配：**
+
+```
+(deviceId + gpu_id + threadId * numLogicalDevices_) % numDevices_
+
+deviceId:             在层中指定
+gpu_id:               默认为0
+threadId:             线程ID号，范围: 0,1,..., trainer_count-1
+numDevices_:          机器的设备(GPU)数目
+numLogicalDevices_:   min(max(deviceId + 1), numDevices_)
+```
--- a/doc/howto/usage/cmd_parameter/use_case_en.md
+++ b/doc/howto/usage/cmd_parameter/use_case_en.md
@ -134,14 +134,14 @@ fc2=fc_layer(...)
 fc3=fc_layer(...,layer_attr=ExtraAttr(device=-1))

 ```
- default_device(0): set default device ID to 0. This means that except the layers with device=-1, all layers will use a GPU, and the specific GPU used for each layer depends on trainer\_count and gpu\_id (0 by default). Here, layer l1 and l2 are computed on the GPU.
+- default_device(0): set default device ID to 0. This means that except the layers with device=-1, all layers will use a GPU, and the specific GPU used for each layer depends on trainer\_count and gpu\_id (0 by default). Here, layer fc1 and fc2 are computed on the GPU.

- device=-1: use the CPU for layer l3.
+- device=-1: use the CPU for layer fc3.

 - trainer_count:
-  - trainer_count=1: if gpu\_id is not set, then use the first GPU to compute layers l1 and l2. Otherwise use the GPU with gpu\_id.
+  - trainer_count=1: if gpu\_id is not set, then use the first GPU to compute layers fc1 and fc2. Otherwise use the GPU with gpu\_id.

-  - trainer_count>1: use trainer\_count GPUs to compute one layer using data parallelism. For example, trainer\_count=2 means that GPUs 0 and 1 will use data parallelism to compute layer l1 and l2.
+  - trainer_count>1: use trainer\_count GPUs to compute one layer using data parallelism. For example, trainer\_count=2 means that GPUs 0 and 1 will use data parallelism to compute layer fc1 and fc2.

 ### Case 2: Specify Layers in Different Devices

@ -157,14 +157,14 @@ fc4=fc_layer(input=fc2, layer_attr=ExtraAttr(device=-1), ...)
 In this case, we assume that there are 4 GPUs in one machine.

 - trainer_count=1:
-  - Use GPU 0 to compute layer l2.
-  - Use GPU 1 to compute layer l3.
-  - Use CPU to compute layer l4.
+  - Use GPU 0 to compute layer fc2.
+  - Use GPU 1 to compute layer fc3.
+  - Use CPU to compute layer fc4.

 - trainer_count=2:
-  - Use GPU 0 and 1 to compute layer l2.
-  - Use GPU 2 and 3 to compute layer l3.
-  - Use CPU to compute l4 in two threads.
+  - Use GPU 0 and 1 to compute layer fc2.
+  - Use GPU 2 and 3 to compute layer fc3.
+  - Use CPU to compute fc4 in two threads.

 - trainer_count=4:
  - It will fail (note, we have assumed that there are 4 GPUs in machine), because argument `allow_only_one_model_on_one_gpu` is true by default.
--- a/doc/howto/usage/k8s/k8s_aws_en.md
+++ b/doc/howto/usage/k8s/k8s_aws_en.md
--- a/doc/howto/usage/cluster/k8s/k8s_cn.md
+++ b/doc/howto/usage/cluster/k8s/k8s_cn.md
@ -1,4 +1,4 @@
-# Kubernetes 单机训练
+# Kubernetes单机训练

 在这篇文档里，我们介绍如何在 Kubernetes 集群上启动一个单机使用CPU的Paddle训练作业。在下一篇中，我们将介绍如何启动分布式训练作业。

--- a/doc/howto/usage/cluster/k8s/k8s_distributed_cn.md
+++ b/doc/howto/usage/cluster/k8s/k8s_distributed_cn.md
@ -1,4 +1,4 @@
-# Kubernetes 分布式训练
+# Kubernetes分布式训练

 前一篇文章介绍了如何在Kubernetes集群上启动一个单机PaddlePaddle训练作业 (Job)。在这篇文章里，我们介绍如何在Kubernetes集群上进行分布式PaddlePaddle训练作业。关于PaddlePaddle的分布式训练，文章 [Cluster Training](https://github.com/baidu/Paddle/blob/develop/doc/cluster/opensource/cluster_train.md)介绍了一种通过SSH远程分发任务，进行分布式训练的方法，与此不同的是，本文将介绍在Kubernetes容器管理平台上快速构建PaddlePaddle容器集群，进行分布式训练的方案。

@ -22,7 +22,7 @@

 首先，我们需要拥有一个Kubernetes集群，在这个集群中所有node与pod都可以互相通信。关于Kubernetes集群搭建，可以参考[官方文档](http://kubernetes.io/docs/getting-started-guides/kubeadm/)，在以后的文章中我们也会介绍AWS上搭建的方案。本文假设大家能找到几台物理机，并且可以按照官方文档在上面部署Kubernetes。在本文的环境中，Kubernetes集群中所有node都挂载了一个[MFS](http://moosefs.org/)（Moose filesystem，一种分布式文件系统）共享目录，我们通过这个目录来存放训练文件与最终输出的模型。关于MFS的安装部署，可以参考[MooseFS documentation](https://moosefs.com/documentation.html)。在训练之前，用户将配置与训练数据切分好放在MFS目录中，训练时，程序从此目录拷贝文件到容器内进行训练，将结果保存到此目录里。整体的结构图如下：

-![paddle on kubernetes结构图](k8s-paddle-arch.png)
+![paddle on kubernetes结构图](src/k8s-paddle-arch.png)

 上图描述了一个3节点的分布式训练场景，Kubernetes集群的每个node上都挂载了一个MFS目录，这个目录可以通过volume的形式挂载到容器中。Kubernetes为这次训练创建了3个pod并且调度到了3个node上运行，每个pod包含一个PaddlePaddle容器。在容器创建后，会启动pserver与trainer进程，读取volume中的数据进行这次分布式训练。

--- a/doc/howto/usage/k8s/k8s_en.md
+++ b/doc/howto/usage/k8s/k8s_en.md
@ -0,0 +1,201 @@
+# Paddle On Kubernetes
+
+>In this article, we will introduce how to run Paddle training job on single CPU machine using Kubernetes. In next article, we will introduce how to run Paddle training job on distributed cluster.
+
+## Build Docker Image
+
+In distributed Kubernetes cluster, we will use Ceph or other shared storage system for storing training related data so that all processes in Paddle training can retrieve data from Ceph. In this example, we will only demo training job on single machine. In order to simplify the requirement of the environment, we will directly put training data into Paddle's Docker Image, so we need to create a Paddle Docker image that already includes the training data.
+
+Paddle's [Quick Start Tutorial](http://www.paddlepaddle.org/doc/demo/quick_start/index_en.html) introduces how to download and train data by using script from Paddle's source code.
+And `paddledev/paddle:cpu-demo-latest` image has the Paddle source code and demo. (Caution: Default Paddle image `paddledev/paddle:cpu-latest` doesn't include the source code, Paddle's different versions of image can be referred here: [Docker installation guide](http://www.paddlepaddle.org/doc/build/docker_install.html)), so we run this container and download the training data, and then commit the whole container to be a new Docker image.
+  
+### Run Docker Container
+
+```
+$ docker run --name quick_start_data -it paddledev/paddle:cpu-demo-latest
+```
+
+### Download Training Data
+
+Getting into `/root/paddle/demo/quick_start/data` Directory，using `get_data.sh` to download training data.
+Then getting into `/root/paddle/demo/quick_start` Directory, using `preprocess.sh` to pre-process training data.
+
+```
+$ root@fbd1f2bb71f4:~/paddle/demo/quick_start/data# ./get_data.sh
+
+Downloading Amazon Electronics reviews data...
+--2016-10-31 01:33:43--  http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz
+Resolving snap.stanford.edu (snap.stanford.edu)... 171.64.75.80
+Connecting to snap.stanford.edu (snap.stanford.edu)|171.64.75.80|:80... connected.
+HTTP request sent, awaiting response... 200 OK
+Length: 495854086 (473M) [application/x-gzip]
+Saving to: 'reviews_Electronics_5.json.gz'
+
+ 10% [=======>                                         ] 874,279     64.7KB/s  eta 2h 13m
+
+```
+
+### Modify Startup Script
+
+After downloading the data，modify `/root/paddle/demo/quick_start/train.sh` file contents are as follows (one more cd cmd):
+```
+set -e
+cd /root/paddle/demo/quick_start
+cfg=trainer_config.lr.py
+#cfg=trainer_config.emb.py
+#cfg=trainer_config.cnn.py
+#cfg=trainer_config.lstm.py
+#cfg=trainer_config.bidi-lstm.py
+#cfg=trainer_config.db-lstm.py
+paddle train \
+  --config=$cfg \
+  --save_dir=./output \
+  --trainer_count=4 \
+  --log_period=20 \
+  --num_passes=15 \
+  --use_gpu=false \
+  --show_parameter_stats_period=100 \
+  --test_all_data_in_one_period=1 \
+  2>&1 | tee 'train.log'
+```
+
+### Commit Docker Image
+
+```
+$ docker commit quick_start_data mypaddle/paddle:quickstart
+```
+
+## Use Kubernetes For Training
+
+>We will use Kubernetes job for training process, following steps shows how to do the training with Kubernetes.
+
+### Create Yaml Files
+
+The output result in container will be demolished when job finished (container stopped running), so we need to mount the volume out to the local disk when creating the container to store the training result. Using our previously created image, we can create a [Kubernetes Job](http://kubernetes.io/docs/user-guide/jobs/#what-is-a-job), the yaml contents are as follows:
+
+```
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: quickstart
+spec:
+  parallelism: 1
+  completions: 1
+  template:
+    metadata:
+      name: quickstart
+    spec:
+      volumes:
+      - name: output
+        hostPath: 
+          path: /home/work/paddle_output     
+      containers:
+      - name: pi
+        image: mypaddle/paddle:quickstart
+        command: ["bin/bash",  "-c", "/root/paddle/demo/quick_start/train.sh"]
+        volumeMounts:
+        - name: output
+          mountPath: /root/paddle/demo/quick_start/output
+      restartPolicy: Never
+```
+
+### Start Paddle Job
+
+Using the above yaml file to start the Kubernetes job.
+
+```
+$ kubectl  create -f paddle.yaml
+```
+
+Get the detailed status of the job:
+
+```
+$ kubectl  get job
+NAME         DESIRED   SUCCESSFUL   AGE
+quickstart   1         0            58s
+
+$ kubectl  describe job quickstart
+Name:		quickstart
+Namespace:	default
+Image(s):	registry.baidu.com/public/paddle:cpu-demo-latest
+Selector:	controller-uid=f120da72-9f18-11e6-b363-448a5b355b84
+Parallelism:	1
+Completions:	1
+Start Time:	Mon, 31 Oct 2016 11:20:16 +0800
+Labels:		controller-uid=f120da72-9f18-11e6-b363-448a5b355b84,job-name=quickstart
+Pods Statuses:	0 Running / 1 Succeeded / 0 Failed
+Volumes:
+  output:
+    Type:	HostPath (bare host directory volume)
+    Path:	/home/work/paddle_output
+Events:
+  FirstSeen	LastSeen	Count	From			SubobjectPath	Type		Reason			Message
+  ---------	--------	-----	----			-------------	--------	------			-------
+  1m		1m		1	{job-controller }			Normal		SuccessfulCreate	Created pod: quickstart-fa0wx
+```
+
+### Get Training Result
+
+We can use kubectl command to take a look at the status of related pod.
+
+```
+$ kubectl  describe pod quickstart-fa0wx
+Name:		quickstart-fa0wx
+Namespace:	default
+Node:		paddle-demo-let02/10.206.202.44
+Start Time:	Mon, 31 Oct 2016 11:20:17 +0800
+Labels:		controller-uid=f120da72-9f18-11e6-b363-448a5b355b84,job-name=quickstart
+Status:		Succeeded
+IP:		10.0.0.9
+Controllers:	Job/quickstart
+Containers:
+  quickstart:
+    Container ID:	docker://b8561f5c79193550d64fa47418a9e67ebdd71546186e840f88de5026b8097465
+    Image:		registry.baidu.com/public/paddle:cpu-demo-latest
+    Image ID:		docker://18e457ce3d362ff5f3febf8e7f85ffec852f70f3b629add10aed84f930a68750
+    Port:
+    Command:
+      bin/bash
+      -c
+      /root/paddle/demo/quick_start/train.sh
+    QoS Tier:
+      cpu:		BestEffort
+      memory:		BestEffort
+    State:		Terminated
+      Reason:		Completed
+      Exit Code:	0
+      Started:		Mon, 31 Oct 2016 11:20:20 +0800
+      Finished:		Mon, 31 Oct 2016 11:21:46 +0800
+    Ready:		False
+    Restart Count:	0
+    Environment Variables:
+Conditions:
+  Type		Status
+  Ready 	False
+Volumes:
+  output:
+    Type:	HostPath (bare host directory volume)
+    Path:	/home/work/paddle_output
+```
+
+We can also ssh to Kubernetes node to take a look at the training result.
+
+```
+[root@paddle-demo-let02 paddle_output]# ll
+total 60
+drwxr-xr-x 2 root root 4096 Oct 31 11:20 pass-00000
+drwxr-xr-x 2 root root 4096 Oct 31 11:20 pass-00001
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00002
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00003
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00004
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00005
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00006
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00007
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00008
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00009
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00010
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00011
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00012
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00013
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00014
+```
--- a/doc/howto/usage/cluster/k8s/Dockerfile
+++ b/doc/howto/usage/cluster/k8s/Dockerfile
--- a/doc/howto/usage/k8s/src/add_security_group.png
+++ b/doc/howto/usage/k8s/src/add_security_group.png
--- a/doc/howto/usage/k8s/src/create_efs.png
+++ b/doc/howto/usage/k8s/src/create_efs.png
--- a/doc/howto/usage/k8s/src/efs_mount.png
+++ b/doc/howto/usage/k8s/src/efs_mount.png
--- a/Show More
+++ b/Show More