Merge conflict with develop branch

8 years ago · ec8e2108a4
parent 1164c287b9 a2a5f4af81
commit ec8e2108a4
34 changed files with 4111 additions and 343 deletions
--- a/.travis.yml
+++ b/.travis.yml
@ -57,7 +57,7 @@ before_install:
  - if [[ "$JOB" == "PRE_COMMIT" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi
  # Paddle is using protobuf 3.1 currently. Protobuf 3.2 breaks the compatibility. So we specify the python 
  # protobuf version.
-  - pip install numpy wheel 'protobuf==3.1' sphinx recommonmark sphinx_rtd_theme virtualenv pre-commit requests==2.9.2 LinkChecker
+  - pip install numpy wheel 'protobuf==3.1' sphinx recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit requests==2.9.2 LinkChecker
 script:
  - paddle/scripts/travis/main.sh
 notifications:
--- a/demo/mnist/api_train_v2.py
+++ b/demo/mnist/api_train_v2.py
@ -20,26 +20,42 @@ def main():
    adam_optimizer = paddle.optimizer.Adam(learning_rate=0.01)
    trainer = paddle.trainer.SGD(cost=cost,
                                 parameters=parameters,
                                 update_equation=adam_optimizer)
    def event_handler(event):
        if isinstance(event, paddle.event.EndIteration):
-            if event.batch_id % 100 == 0:
+            if event.batch_id % 1000 == 0:
-                print "Pass %d, Batch %d, Cost %f, %s" % (
+                result = trainer.test(reader=paddle.reader.batched(
-                    event.pass_id, event.batch_id, event.cost, event.metrics)
+                    paddle.dataset.mnist.test(), batch_size=256))
                print "Pass %d, Batch %d, Cost %f, %s, Testing metrics %s" % (
                    event.pass_id, event.batch_id, event.cost, event.metrics,
                    result.metrics)
        else:
            pass
    trainer = paddle.trainer.SGD(update_equation=adam_optimizer)
    trainer.train(
        reader=paddle.reader.batched(
            paddle.reader.shuffle(
                paddle.dataset.mnist.train(), buf_size=8192),
            batch_size=32),
-        cost=cost,
+        event_handler=event_handler)
    # output is a softmax layer. It returns probabilities.
    # Shape should be (100, 10)
    probs = paddle.infer(
        output=inference,
        parameters=parameters,
-        event_handler=event_handler,
+        reader=paddle.reader.batched(
-        reader_dict={images.name: 0,
+            paddle.reader.firstn(
-                     label.name: 1})
+                paddle.reader.map_readers(lambda item: (item[0], ),
                                          paddle.dataset.mnist.test()),
                n=100),
            batch_size=32))
    print probs.shape
 if __name__ == '__main__':
--- a/doc/howto/index_cn.rst
+++ b/doc/howto/index_cn.rst
@ -10,6 +10,7 @@
  usage/cmd_parameter/index_cn.rst
  usage/concepts/use_concepts_cn.rst
  usage/cluster/cluster_train_cn.md
  usage/k8s/k8s_basis_cn.md
  usage/k8s/k8s_cn.md
  usage/k8s/k8s_distributed_cn.md
--- a/doc/howto/usage/cluster/cluster_train_cn.md
+++ b/doc/howto/usage/cluster/cluster_train_cn.md
@ -6,7 +6,7 @@
 在本文中，我们将阐释如何在集群上运行分布式 Paddle 训练作业。我们将以[推荐系统](https://github.com/baidu/Paddle/tree/develop/demo/recommendation)为例创建分布式的单进程训练。
-在本文中使用的[脚本](https://github.com/baidu/Paddle/tree/develop/paddle/scripts/cluster_train)通过 SSH 运行分布式作业。 它们还可以供那些运行更复杂的集群管理系统（如 MPI 和 [Kubernetes](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/k8s) ）的用户参考。
+在本文中使用的[脚本](https://github.com/baidu/Paddle/tree/develop/paddle/scripts/cluster_train)通过 SSH 运行分布式作业。 它们还可以供那些运行更复杂的集群管理系统（如 MPI 和 [Kubernetes](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/k8s) ）的用户参考。
 ## 前提条件
--- a/doc/howto/usage/cluster/cluster_train_en.md
+++ b/doc/howto/usage/cluster/cluster_train_en.md
@ -2,7 +2,7 @@
 In this article, we explain how to run distributed Paddle training jobs on clusters.  We will create the distributed version of the single-process training example, [recommendation](https://github.com/baidu/Paddle/tree/develop/demo/recommendation).
-[Scripts](https://github.com/baidu/Paddle/tree/develop/paddle/scripts/cluster_train) used in this article launch distributed jobs via SSH.  They also work as a reference for users running more sophisticated cluster management systems like MPI and [Kubernetes](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/k8s).
+[Scripts](https://github.com/baidu/Paddle/tree/develop/paddle/scripts/cluster_train) used in this article launch distributed jobs via SSH.  They also work as a reference for users running more sophisticated cluster management systems like MPI and [Kubernetes](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/k8s).
 ## Prerequisite
--- a/doc/howto/usage/k8s/k8s_basis_cn.md
+++ b/doc/howto/usage/k8s/k8s_basis_cn.md
@ -0,0 +1,75 @@
 # Kubernetes 简介
 [*Kubernetes*](http://kubernetes.io/)是Google开源的容器集群管理系统，其提供应用部署、维护、扩展机制等功能，利用Kubernetes能方便地管理跨机器运行容器化的应用。Kubernetes可以在物理机或虚拟机上运行，且支持部署到[AWS](http://kubernetes.io/docs/getting-started-guides/aws)，[Azure](http://kubernetes.io/docs/getting-started-guides/azure/)，[GCE](http://kubernetes.io/docs/getting-started-guides/gce)等多种公有云环境。介绍分布式训练之前，需要对[Kubernetes](http://kubernetes.io/)有一个基本的认识，下面先简要介绍一下本文用到的几个Kubernetes概念。
 - [*Node*](http://kubernetes.io/docs/admin/node/) 表示一个Kubernetes集群中的一个工作节点，这个节点可以是物理机或者虚拟机，Kubernetes集群就是由node节点与master节点组成的。
 - [*Pod*](http://kubernetes.io/docs/user-guide/pods/) 是一组(一个或多个)容器，pod是Kubernetes的最小调度单元，一个pod中的所有容器会被调度到同一个node上。Pod中的容器共享NET，PID，IPC，UTS等Linux namespace。由于容器之间共享NET namespace，所以它们使用同一个IP地址，可以通过*localhost*互相通信。不同pod之间可以通过IP地址访问。
 - [*Job*](http://kubernetes.io/docs/user-guide/jobs/) 描述Kubernetes上运行的作业，一次作业称为一个job，通常每个job包括一个或者多个pods，job启动后会创建这些pod并开始执行一个程序，等待这个程序执行成功并返回0则成功退出，如果执行失败，也可以配置不同的重试机制。
 - [*Volume*](http://kubernetes.io/docs/user-guide/volumes/) 存储卷，是pod内的容器都可以访问的共享目录，也是容器与node之间共享文件的方式，因为容器内的文件都是暂时存在的，当容器因为各种原因被销毁时，其内部的文件也会随之消失。通过volume，就可以将这些文件持久化存储。Kubernetes支持多种volume，例如hostPath(宿主机目录)，gcePersistentDisk，awsElasticBlockStore等。
 - [*Namespaces*](https://kubernetes.io/docs/user-guide/namespaces/) 命名空间，在kubernetes中创建的所有资源对象(例如上文的pod，job)等都属于一个命名空间，在同一个命名空间中，资源对象的名字是唯一的，不同空间的资源名可以重复，命名空间主要用于对资源对象进行逻辑上的分组，便于管理。本文只使用了默认命名空间。
 - [*PersistentVolume*](https://kubernetes.io/docs/user-guide/persistent-volumes/): 和[*PersistentVolumeClaim*](https://kubernetes.io/docs/user-guide/persistent-volumes/#persistentvolumeclaims)结合，将外部的存储服务在Kubernetes中描述成为统一的资源形式，便于存储资源管理和Pod引用。
 # 部署Kubernetes集群
 Kubernetes提供了多种集群部署的方案，本文档内不重复介绍。这里给出几种常见的部署方法：
 - [*minikube*](https://kubernetes.io/docs/getting-started-guides/minikube/): 快速在本地启动一个单机的kubernetes服务器，便于本地验证和测试。
 - [*kubeadm*](http://kubernetes.io/docs/getting-started-guides/kubeadm/): 在不同操作系统，不同主机(Bare-Metal, AWS, GCE)条件下，快速部署集群。
 - [*AWS EC2*](https://kubernetes.io/docs/getting-started-guides/aws/): 在aws上快速部署集群。
 - [*Bare-Metal*](https://kubernetes.io/docs/getting-started-guides/centos/centos_manual_config/): 在物理机上手动部署。
 可以参考[这个表格](https://kubernetes.io/docs/getting-started-guides/#table-of-solutions)选择适合您的场景的合适方案。
 # 选择存储方案
 容器不会保留在运行时生成的数据，job或者应用程序在容器中运行时生成的数据会在容器销毁时消失。为了完成分布式机器学习训练任务，需要有一个外部的存储服务来保存训练所需数据和训练输出。
 常见的可选存储服务包括：
 - [*NFS*](https://github.com/kubernetes/kubernetes/tree/master/examples/volumes/nfs): 可以将磁盘上某个目录共享给网络中其他机器访问。部署和配置比较简单，可以用于小量数据的验证。不提供分布式存储，高可用，冗余等功能。NFS的部署方法可以参考[这里](http://www.tecmint.com/how-to-setup-nfs-server-in-linux/)。
 - [*GlusterFS*](http://gluster.readthedocs.io/en/latest/Quick-Start-Guide/Quickstart/): 网络分布式文件系统，可以在Kubernetes中按照[这个](https://github.com/kubernetes/kubernetes/tree/master/examples/volumes/glusterfs)例子使用。
 - [*Ceph*](http://docs.ceph.com/docs/master/): 分布式文件系统，支持rbd，POSIX API接口(ceph fs)和对象存储API，参考[这里](https://kubernetes.io/docs/user-guide/volumes/#rbd)。
 - [*MooseFS*](https://moosefs.com/documentation.html): 一个分布式的存储系统。需要先挂载到服务器Node上再通过kubernetes hostPath Volume挂载到容器中。
 # 配置kubectl
 ## 安装kubectl
 ```
 # OS X
 curl -LO https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/darwin/amd64/kubectl
 # Linux
 curl -LO https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/linux/amd64/kubectl
 # Windows
 curl -LO https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/windows/amd64/kubectl.exe
 ```
 ## 配置kubectl访问你的kubernetes集群
 编辑`~/.kube/config`这个配置文件，修改`Master-IP`的地址。如果使用SSL认证，则需要配置`certificate-authority`和`users`中的用户证书。如果是使用非SSL方式访问（比如通过8080端口），也可以去掉这些证书的配置。
 ```
 apiVersion: v1
 clusters:
 - cluster:
    certificate-authority: /path/to/ca.crt
    server: https://[Master-IP]:443
  name: minikube
 contexts:
 - context:
    cluster: minikube
    user: minikube
  name: minikube
 current-context: minikube
 kind: Config
 preferences: {}
 users:
 - name: minikube
  user:
    client-certificate: /path/to/apiserver.crt
    client-key: /path/to/apiserver.key
 ```
--- a/doc/howto/usage/k8s/k8s_distributed_cn.md
+++ b/doc/howto/usage/k8s/k8s_distributed_cn.md
--- a/doc/howto/usage/k8s/src/k8s-paddle-arch.png
+++ b/doc/howto/usage/k8s/src/k8s-paddle-arch.png
--- a/paddle/api/PaddleAPI.h
+++ b/paddle/api/PaddleAPI.h
@ -47,6 +47,9 @@ void setUseGpu(bool useGpu);
 /// Return true if this py_paddle is compiled in GPU Version
 bool isGpuVersion();
 /// Return FLAGS_trainer_count
 int getTrainerCount();
 /// The Error of IO Operation. Such as file not found, etc.
 class IOError {};
--- a/paddle/api/Util.cpp
+++ b/paddle/api/Util.cpp
@ -54,5 +54,7 @@ bool isGpuVersion() {
 #endif
 }
 int getTrainerCount() { return FLAGS_trainer_count; }
 static_assert(NUM_PARAMETER_TYPES == paddle::NUM_PARAMETER_TYPES,
              "The Parameter Type should be same in core/api and core/common");
--- a/paddle/py_paddle/dataprovider_converter.py
+++ b/paddle/py_paddle/dataprovider_converter.py
@ -26,6 +26,15 @@ class IScanner(object):
        if not isinstance(self.input_type, dp2.InputType):
            raise ValueError("input type should be dataprovider2.InputType")
        self.pos = pos
        # data_in_gpu is used to indicate whether to create argument on GPU
        # or not in GPU mode. Now if using one thread (trainer_count=1),
        # trainer uses NeuralNetwork which needs to create argument on GPU
        # before calling forward function. So, set data_in_gpu to True.
        # Otherwise, trainer uses MultiGradientMachine which will transfer
        # data from CPU to GPU in the forward function, set data_in_gpu to
        # False in this case.
        self.data_in_gpu = swig_paddle.isUsingGpu(
        ) and swig_paddle.getTrainerCount() == 1
    def scan(self, dat):
        pass
@ -53,7 +62,8 @@ class DenseScanner(IScanner):
        assert isinstance(argument, swig_paddle.Arguments)
        if self.__mat__.dtype != numpy.float32:
            self.__mat__ = self.__mat__.astype(numpy.float32)
-        m = swig_paddle.Matrix.createDenseFromNumpy(self.__mat__, True, False)
+        m = swig_paddle.Matrix.createDenseFromNumpy(self.__mat__, True,
                                                    self.data_in_gpu)
        argument.setSlotValue(self.pos, m)
@ -75,10 +85,13 @@ class SparseBinaryScanner(IScanner):
    def finish_scan(self, argument):
        assert isinstance(argument, swig_paddle.Arguments)
-        m = swig_paddle.Matrix.createSparse(self.__height__,
+        m = swig_paddle.Matrix.createSparse(
            self.__height__,
            self.input_type.dim,
            len(self.__cols__),
-                                            len(self.__value__) == 0)
+            len(self.__value__) == 0,
            False,  # trans
            False)  # TODO support GPU
        assert isinstance(m, swig_paddle.Matrix)
        m.sparseCopyFrom(self.__rows__, self.__cols__, self.__value__)
        argument.setSlotValue(self.pos, m)
@ -102,7 +115,7 @@ class IndexScanner(IScanner):
        self.__ids__.append(dat)
    def finish_scan(self, argument):
-        ids = swig_paddle.IVector.create(self.__ids__)
+        ids = swig_paddle.IVector.create(self.__ids__, self.data_in_gpu)
        assert isinstance(argument, swig_paddle.Arguments)
        argument.setSlotIds(self.pos, ids)
--- a/paddle/scripts/docker/Dockerfile
+++ b/paddle/scripts/docker/Dockerfile
@ -5,38 +5,50 @@ ARG DEBIAN_FRONTEND=noninteractive
 ARG UBUNTU_MIRROR
 RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi'
 # ENV variables
 ARG BUILD_WOBOQ
 ARG BUILD_AND_INSTALL
 ARG WITH_AVX
 ARG WITH_DOC
 ARG WITH_STYLE_CHECK
 ENV BUILD_WOBOQ=${BUILD_WOBOQ:-OFF}
 ENV BUILD_AND_INSTALL=${BUILD_AND_INSTALL:-OFF}
 ENV WITH_GPU=OFF
 ENV WITH_AVX=${WITH_AVX:-ON}
 ENV WITH_DOC=${WITH_DOC:-OFF}
 ENV WITH_STYLE_CHECK=${WITH_STYLE_CHECK:-OFF}
 ENV HOME /root
 # Add bash enhancements
 COPY ./paddle/scripts/docker/root/ /root/
 RUN apt-get update && \
    apt-get install -y git python-pip python-dev openssh-server bison && \
    apt-get install -y wget unzip tar xz-utils bzip2 gzip coreutils && \
    apt-get install -y curl sed grep graphviz libjpeg-dev zlib1g-dev && \
    apt-get install -y python-numpy python-matplotlib gcc g++ gfortran && \
-    apt-get install -y automake && \
+    apt-get install -y automake locales clang-format-3.8 && \
    apt-get clean -y
 # git credential to skip password typing
 RUN git config --global credential.helper store
 # Fix locales to en_US.UTF-8
 RUN localedef -i en_US -f UTF-8 en_US.UTF-8
 RUN pip install --upgrade pip && \
-    pip install -U "protobuf==3.1.0" && \
+    pip install -U 'protobuf==3.1.0' && \
    pip install -U wheel pillow BeautifulSoup && \
    pip install -U docopt PyYAML sphinx && \
-    pip install -U sphinx_rtd_theme recommonmark jupyter
+    pip install -U sphinx-rtd-theme==0.1.9 recommonmark && \
    pip install -U pre-commit 'requests==2.9.2' jupyter
 RUN curl -sSL https://cmake.org/files/v3.4/cmake-3.4.1.tar.gz | tar -xz && \
    cd cmake-3.4.1 && ./bootstrap && make -j `nproc` && make install && \
    cd .. && rm -rf cmake-3.4.1
 ARG BUILD_WOBOQ
 ARG BUILD_AND_INSTALL
 ARG WITH_AVX
 ARG WITH_DOC
 ARG WITH_STYLE_CHECK
 ENV BUILD_WOBOQ=${BUILD_WOBOQ:-OFF}
 ENV BUILD_AND_INSTALL=${BUILD_AND_INSTALL:-OFF}
 ENV WITH_GPU=OFF
 ENV WITH_AVX=${WITH_AVX:-ON}
 ENV WITH_DOC=${WITH_DOC:-OFF}
 ENV WITH_STYLE_CHECK=${WITH_STYLE_CHECK:-OFF}
 RUN mkdir /paddle
 COPY . /paddle/
 RUN /paddle/paddle/scripts/docker/build.sh
 VOLUME ["/usr/share/nginx/html/data", "/usr/share/nginx/html/paddle"]
@ -53,7 +65,6 @@ RUN mkdir /notes/
 WORKDIR "/notes"
 EXPOSE 8888
 RUN mkdir -p /opt/bin
 COPY ./paddle/scripts/docker/entrypoint /opt/bin/
 CMD ["/opt/bin/entrypoint"]
--- a/paddle/scripts/docker/Dockerfile.gpu
+++ b/paddle/scripts/docker/Dockerfile.gpu
@ -5,38 +5,50 @@ ARG DEBIAN_FRONTEND=noninteractive
 ARG UBUNTU_MIRROR
 RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi'
 # ENV variables
 ARG BUILD_WOBOQ
 ARG BUILD_AND_INSTALL
 ARG WITH_AVX
 ARG WITH_DOC
 ARG WITH_STYLE_CHECK
 ENV BUILD_WOBOQ=${BUILD_WOBOQ:-OFF}
 ENV BUILD_AND_INSTALL=${BUILD_AND_INSTALL:-OFF}
 ENV WITH_GPU=ON
 ENV WITH_AVX=${WITH_AVX:-ON}
 ENV WITH_DOC=${WITH_DOC:-OFF}
 ENV WITH_STYLE_CHECK=${WITH_STYLE_CHECK:-OFF}
 ENV HOME /root
 # Add bash enhancements
 COPY ./paddle/scripts/docker/root/ /root/
 RUN apt-get update && \
    apt-get install -y git python-pip python-dev openssh-server bison && \
    apt-get install -y wget unzip tar xz-utils bzip2 gzip coreutils && \
    apt-get install -y curl sed grep graphviz libjpeg-dev zlib1g-dev && \
    apt-get install -y python-numpy python-matplotlib gcc g++ gfortran && \
-    apt-get install -y automake && \
+    apt-get install -y automake locales clang-format-3.8 && \
    apt-get clean -y
 # git credential to skip password typing
 RUN git config --global credential.helper store
 # Fix locales to en_US.UTF-8
 RUN localedef -i en_US -f UTF-8 en_US.UTF-8
 RUN pip install --upgrade pip && \
-    pip install -U "protobuf==3.1.0" && \
+    pip install -U 'protobuf==3.1.0' && \
    pip install -U wheel pillow BeautifulSoup && \
    pip install -U docopt PyYAML sphinx && \
-    pip install -U sphinx_rtd_theme recommonmark jupyter
+    pip install -U sphinx-rtd-theme==0.1.9 recommonmark && \
    pip install -U pre-commit 'requests==2.9.2' jupyter
 RUN curl -sSL https://cmake.org/files/v3.4/cmake-3.4.1.tar.gz | tar -xz && \
    cd cmake-3.4.1 && ./bootstrap && make -j `nproc` && make install && \
    cd .. && rm -rf cmake-3.4.1
 ARG BUILD_WOBOQ
 ARG BUILD_AND_INSTALL
 ARG WITH_AVX
 ARG WITH_DOC
 ARG WITH_STYLE_CHECK
 ENV BUILD_WOBOQ=${BUILD_WOBOQ:-OFF}
 ENV BUILD_AND_INSTALL=${BUILD_AND_INSTALL:-OFF}
 ENV WITH_GPU=ON
 ENV WITH_AVX=${WITH_AVX:-ON}
 ENV WITH_DOC=${WITH_DOC:-OFF}
 ENV WITH_STYLE_CHECK=${WITH_STYLE_CHECK:-OFF}
 RUN mkdir /paddle
 COPY . /paddle/
 RUN /paddle/paddle/scripts/docker/build.sh
 VOLUME ["/usr/share/nginx/html/data", "/usr/share/nginx/html/paddle"]
@ -53,7 +65,6 @@ RUN mkdir /notes/
 WORKDIR "/notes"
 EXPOSE 8888
 RUN mkdir -p /opt/bin
 COPY ./paddle/scripts/docker/entrypoint /opt/bin/
 CMD ["/opt/bin/entrypoint"]
--- a/paddle/scripts/docker/root/.bashrc
+++ b/paddle/scripts/docker/root/.bashrc
@ -0,0 +1,46 @@
 # Locales
 export LC_ALL=en_US.UTF-8
 export LANG=en_US.UTF-8
 export LANGUAGE=en_US.UTF-8
 # Aliases
 # Ask for confirmation before removing or overwriting files.
 alias rm='rm -i'
 alias cp='cp -i'
 alias mv='mv -i'
 # Listing shortcuts. They expand through the `ls` alias defined in the
 # "Colorize directory listing" section below. The former `ls -hFG` alias
 # and the second, identical `ll` definition were removed: the first was
 # always overridden by the later `ls` alias, the second was a duplicate.
 alias l='ls -lF'
 alias ll='ls -alF'
 alias lt='ls -ltrF'
 alias lls='ls -alSrF'
 alias llt='ls -altrF'
 # Colorize directory listing
 alias ls="ls -ph --color=auto"
 # Colorize grep
 # Only enable the color options when this grep accepts --color=auto.
 if echo hello|grep --color=auto l >/dev/null 2>&1; then
  export GREP_OPTIONS="--color=auto" GREP_COLOR="1;31"
 fi
 # Shell
 export CLICOLOR="1"
 # Color escape sequences used by the prompt below.
 YELLOW="\[\033[1;33m\]"
 NO_COLOUR="\[\033[0m\]"
 GREEN="\[\033[1;32m\]"
 WHITE="\[\033[1;37m\]"
 # git-prompt.sh is sourced before PS1 is set because PS1 calls __git_ps1.
 source ~/.scripts/git-prompt.sh
 export PS1="\[\033[1;33m\]λ $WHITE\h $GREEN\w$YELLOW\$(__git_ps1 \" \[\033[35m\]{\[\033[36m\]%s\[\033[35m\]}\")$NO_COLOUR "
 # Git
 source ~/.scripts/git-completion.sh
--- a/paddle/scripts/docker/root/.gitconfig
+++ b/paddle/scripts/docker/root/.gitconfig
@ -0,0 +1,43 @@
 [user]
  name =
  email =
 [alias]
  st = status --branch --short
  ci = commit
  br = branch
  co = checkout
  df = diff
  l = log --pretty=format:\"%h %ad | %s%d [%an]\" --graph --date=short
  ll = log --stat
 [merge]
  tool = vimdiff
 [core]
  excludesfile = ~/.gitignore
  editor = vim
 [color]
  branch = auto
  diff = auto
  status = auto
 [color "branch"]
  current = yellow reverse
  local = yellow
  remote = green
 [color "diff"]
  meta = yellow bold
  frag = magenta bold
  old = red bold
  new = green bold
 [color "status"]
  added = yellow
  changed = green
  untracked = cyan
 [push]
  default = matching
--- a/paddle/scripts/docker/root/.scripts/git-completion.sh
+++ b/paddle/scripts/docker/root/.scripts/git-completion.sh
--- a/paddle/scripts/docker/root/.scripts/git-prompt.sh
+++ b/paddle/scripts/docker/root/.scripts/git-prompt.sh
--- a/python/paddle/v2/init.py
+++ b/python/paddle/v2/init.py
@ -24,12 +24,14 @@ from . import dataset
 from . import reader
 import attr
 import pooling
 import inferencer
 import networks
 import py_paddle.swig_paddle as api
 __all__ = [
    'optimizer', 'layer', 'activation', 'parameters', 'init', 'trainer',
    'event', 'data_type', 'attr', 'pooling', 'data_feeder', 'dataset', 'reader',
-    'topology'
+    'topology', 'networks', 'inferencer', 'infer'
 ]
@ -39,3 +41,6 @@ def init(**kwargs):
        args.append('--%s=%s' % (key, str(kwargs[key])))
    api.initPaddle(*args)
 infer = inferencer.infer
--- a/python/paddle/v2/config_base.py
+++ b/python/paddle/v2/config_base.py
@ -0,0 +1,86 @@
 # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import collections
 from paddle.trainer_config_helpers.default_decorators import wrap_name_default
 import paddle.trainer_config_helpers as conf_helps
 class Layer(object):
    """
    Base class of v2 layers: a (possibly named) node that keeps references
    to its parent layers.

    Subclasses implement ``to_proto_impl``. ``to_proto`` first converts the
    parents and then stores the result of a named layer in ``context`` so
    that the same layer is converted only once per context.
    """

    def __init__(self, name=None, parent_layers=None):
        """
        :param name: name of the layer, or None for an anonymous layer.
        :param parent_layers: dict that maps an argument name to a parent
            layer or to a sequence of parent layers. None (the default)
            means that the layer has no parents.
        """
        # The declared default used to trip the assert below, so a layer
        # without parents could not be created with Layer(name=...).
        if parent_layers is None:
            parent_layers = dict()
        assert isinstance(parent_layers, dict)
        self.name = name
        self.__parent_layers__ = parent_layers

    def to_proto(self, context):
        """
        function to set proto attribute

        :param context: dict that maps a layer name to the object already
            produced for this layer; it is updated in place.
        :return: the result of ``to_proto_impl`` for this layer. For a
            named layer the value cached in ``context`` is returned.
        """
        # collections.Sequence is not available on Python 3.10 and newer;
        # fall back to collections.abc when the old alias is missing.
        sequence_type = getattr(collections, 'Sequence', None)
        if sequence_type is None:
            from collections import abc as collections_abc
            sequence_type = collections_abc.Sequence

        kwargs = dict()
        for arg_name, parent in self.__parent_layers__.items():
            if isinstance(parent, sequence_type):
                # A list comprehension yields a list on Python 2 and 3,
                # whereas map() is a lazy iterator on Python 3.
                kwargs[arg_name] = [
                    each.to_proto(context=context) for each in parent
                ]
            else:
                kwargs[arg_name] = parent.to_proto(context=context)

        if self.name is None:
            # Anonymous layers cannot be cached by name.
            return self.to_proto_impl(**kwargs)
        elif self.name not in context:
            context[self.name] = self.to_proto_impl(**kwargs)
        return context[self.name]

    def to_proto_impl(self, **kwargs):
        """
        Produce the object for this layer from its converted parents.
        Must be overridden by subclasses.

        :raise NotImplementedError: always, in this base class.
        """
        raise NotImplementedError()
 def __convert_to_v2__(method_name, parent_names, is_default_name=True):
    """
    Create a Layer subclass that forwards to the function ``method_name``
    of paddle.trainer_config_helpers.

    :param method_name: attribute name looked up on
        paddle.trainer_config_helpers when the layer is converted.
    :param parent_names: names of the keyword arguments that hold parent
        layers; all other keyword arguments are passed through unchanged.
    :param is_default_name: when true, ``__init__`` is decorated with
        ``wrap_name_default(name_prefix=method_name)`` (presumably this
        supplies a generated ``name`` when none is given; confirm against
        default_decorators).
    :return: the generated class.
    """
    wrapper = (wrap_name_default(name_prefix=method_name)
               if is_default_name else None)

    class V2LayerImpl(Layer):
        def __init__(self, **kwargs):
            # Split the keyword arguments into parent layers and the
            # remaining arguments ('name' stays among the remaining ones
            # unless it is listed in parent_names).
            parent_layers = dict((pname, kwargs[pname])
                                 for pname in parent_names
                                 if pname in kwargs)
            other_kwargs = dict((key, kwargs[key]) for key in kwargs.keys()
                                if key not in parent_names)

            super(V2LayerImpl, self).__init__(
                kwargs.get('name', None), parent_layers)
            self.__other_kwargs__ = other_kwargs

        if wrapper is not None:
            __init__ = wrapper(__init__)

        def to_proto_impl(self, **kwargs):
            # The stored arguments win over converted parents on a name
            # clash, because they are applied last.
            args = dict(kwargs)
            args.update(self.__other_kwargs__)
            return getattr(conf_helps, method_name)(**args)

    return V2LayerImpl
--- a/python/paddle/v2/dataset/common.py
+++ b/python/paddle/v2/dataset/common.py
@ -32,3 +32,10 @@ def download(url, module_name, md5sum):
            shutil.copyfileobj(r.raw, f)
    return filename
 def dict_add(a_dict, ele):
    """
    Increase the count stored for ``ele`` in ``a_dict`` by one.

    A key that is not in the dictionary yet is inserted with the count 1.
    The dictionary is modified in place; nothing is returned.
    """
    a_dict[ele] = a_dict.get(ele, 0) + 1
--- a/python/paddle/v2/dataset/imdb.py
+++ b/python/paddle/v2/dataset/imdb.py
@ -0,0 +1,120 @@
 # /usr/bin/env python
 # -*- coding:utf-8 -*-
 # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
 IMDB dataset: http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz
 """
 import paddle.v2.dataset.common
 import tarfile
 import Queue
 import re
 import string
 import threading
 __all__ = ['build_dict', 'train', 'test']
 URL = 'http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz'
 MD5 = '7c2ac02c03563afcf9b574c7e56c153a'
 # Read files that match pattern.  Tokenize and yield each file.
 def tokenize(pattern):
    """
    Yield the token list of every member of the IMDB archive whose name
    matches ``pattern``.

    :param pattern: object with a ``match`` method (a compiled regular
        expression) that is applied to the member names.
    """
    archive_path = paddle.v2.dataset.common.download(URL, 'imdb', MD5)
    with tarfile.open(archive_path) as tarf:
        # The members are visited one after another with next(). Per the
        # original author's note, this keeps the access to the archive
        # sequential instead of jumping around in the file.
        member = tarf.next()
        while member is not None:
            if pattern.match(member.name):
                raw = tarf.extractfile(member).read()
                # newline and punctuations removal and ad-hoc tokenization.
                cleaned = raw.rstrip("\n\r").translate(None,
                                                       string.punctuation)
                yield cleaned.lower().split()
            member = tarf.next()
 def build_dict(pattern, cutoff):
    """
    Build a word -> index dictionary from the documents that match
    ``pattern``.

    :param pattern: pattern object handed to tokenize() to select the
        documents.
    :param cutoff: only words that occur more than ``cutoff`` times are
        kept.
    :return: dict that maps every kept word to its index. Indices start at
        0 and follow descending frequency, ties in alphabetical order;
        '<unk>' gets the index after the last kept word.
    """
    word_freq = {}
    for doc in tokenize(pattern):
        for word in doc:
            paddle.v2.dataset.common.dict_add(word_freq, word)

    # Not sure if we should prune less-frequent words here.
    frequent = [item for item in word_freq.items() if item[1] > cutoff]
    dictionary = sorted(frequent, key=lambda x: (-x[1], x[0]))
    # Enumerating the sorted pairs, instead of unzipping them, also works
    # when no word passes the cutoff; the result is then {'<unk>': 0}
    # rather than a ValueError from unpacking an empty zip.
    word_idx = dict((word, idx) for idx, (word, _) in enumerate(dictionary))
    word_idx['<unk>'] = len(dictionary)
    return word_idx
 def reader_creator(pos_pattern, neg_pattern, word_idx, buffer_size):
    """
    Create a generator that interleaves the samples of two document sets.

    :param pos_pattern: pattern handed to tokenize(); its documents are
        yielded with the label 0.
    :param neg_pattern: pattern handed to tokenize(); its documents are
        yielded with the label 1.
    :param word_idx: dict that maps a word to its id. It must contain the
        key '<unk>'; that id is used for every word missing from the dict.
    :param buffer_size: maxsize of each of the two queues that connect the
        loader threads with the consumer.
    :return: a generator object, not a reader function, because reader()
        is already called here. Every item is a tuple
        (list of word ids, label).
    """
    UNK = word_idx['<unk>']
    # qs[0] receives the pos_pattern documents, qs[1] the neg_pattern ones.
    qs = [Queue.Queue(maxsize=buffer_size), Queue.Queue(maxsize=buffer_size)]
    def load(pattern, queue):
        # Producer: put every tokenized document into the queue, followed
        # by None as the end-of-stream marker.
        for doc in tokenize(pattern):
            queue.put(doc)
        queue.put(None)
    def reader():
        # Start two daemon threads that load the positive and the negative
        # samples into qs. This body is a generator, so the threads are
        # only started when the generator is iterated for the first time.
        t0 = threading.Thread(
            target=load, args=(
                pos_pattern,
                qs[0], ))
        t0.daemon = True
        t0.start()
        t1 = threading.Thread(
            target=load, args=(
                neg_pattern,
                qs[1], ))
        t1.daemon = True
        t1.start()
        # Read alternately from qs[0] and qs[1]; i % 2 is both the index of
        # the queue that delivered the document and the yielded label.
        i = 0
        doc = qs[i].get()
        while doc != None:
            yield [word_idx.get(w, UNK) for w in doc], i % 2
            i += 1
            doc = qs[i % 2].get()
        # One queue has delivered its None marker. Step to the other queue
        # and drain it; i is not advanced any more, so the label stays the
        # same until that queue delivers None as well.
        i += 1
        doc = qs[i % 2].get()
        while doc != None:
            yield [word_idx.get(w, UNK) for w in doc], i % 2
            doc = qs[i % 2].get()
    return reader()
 def train(word_idx):
    """
    Return the sample generator over the files below aclImdb/train/.

    :param word_idx: word -> id dict that is passed on to reader_creator.
    :return: the value of reader_creator() for the training patterns with
        a buffer size of 1000.
    """
    pos_pattern = re.compile("aclImdb/train/pos/.*\.txt$")
    neg_pattern = re.compile("aclImdb/train/neg/.*\.txt$")
    return reader_creator(pos_pattern, neg_pattern, word_idx, 1000)
 def test(word_idx):
    """
    Return the sample generator over the files below aclImdb/test/.

    :param word_idx: word -> id dict that is passed on to reader_creator.
    :return: the value of reader_creator() for the test patterns with a
        buffer size of 1000.
    """
    pos_pattern = re.compile("aclImdb/test/pos/.*\.txt$")
    neg_pattern = re.compile("aclImdb/test/neg/.*\.txt$")
    return reader_creator(pos_pattern, neg_pattern, word_idx, 1000)
--- a/python/paddle/v2/dataset/imikolov.py
+++ b/python/paddle/v2/dataset/imikolov.py
@ -0,0 +1,79 @@
 """
 imikolov's simple dataset: http://www.fit.vutbr.cz/~imikolov/rnnlm/
 """
 import paddle.v2.dataset.common
 import tarfile
 __all__ = ['train', 'test']
 URL = 'http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz'
 MD5 = '30177ea32e27c525793142b6bf2c8e2d'
 def word_count(f, word_freq=None):
    """
    Count the whitespace separated words of all lines of ``f``.

    Every line additionally contributes one '<s>' and one '<e>'.

    :param f: iterable of text lines.
    :param word_freq: dict with counts to continue from; it is updated in
        place. A new dict is created when it is None.
    :return: the dict that maps a word to its count.
    """
    counts = {} if word_freq is None else word_freq
    bump = paddle.v2.dataset.common.dict_add
    for line in f:
        for token in line.strip().split() + ['<s>', '<e>']:
            bump(counts, token)
    return counts
 def build_dict(train_filename, test_filename):
    """
    Build a word -> index dictionary from two members of the imikolov
    archive.

    :param train_filename: name of the first archive member to count.
    :param test_filename: name of the second archive member to count.
    :return: dict that maps every word occurring more than 50 times to its
        index (descending frequency, ties in alphabetical order); '<unk>'
        gets the index after the last word.
    """
    archive_path = paddle.v2.dataset.common.download(
        paddle.v2.dataset.imikolov.URL, 'imikolov',
        paddle.v2.dataset.imikolov.MD5)
    with tarfile.open(archive_path) as tf:
        trainf = tf.extractfile(train_filename)
        testf = tf.extractfile(test_filename)
        # The counts of the first file are fed into the count of the
        # second one, which gives the totals over both files.
        word_freq = word_count(testf, word_count(trainf))

    TYPO_FREQ = 50
    frequent = [pair for pair in word_freq.items() if pair[1] > TYPO_FREQ]
    dictionary = sorted(frequent, key=lambda pair: (-pair[1], pair[0]))
    words, _ = list(zip(*dictionary))
    word_idx = dict(zip(words, xrange(len(words))))
    word_idx['<unk>'] = len(words)
    return word_idx
 word_idx = {}
 def reader_creator(filename, n):
    """
    Create a reader that yields the n-gram windows of one archive member.

    The module level ``word_idx`` is built on the first call and reused
    afterwards.

    :param filename: name of the archive member to read.
    :param n: number of word ids per yielded tuple.
    :return: a function without arguments; calling it gives a generator of
        tuples with ``n`` word ids each.
    """
    global word_idx
    if not word_idx:
        word_idx = build_dict('./simple-examples/data/ptb.train.txt',
                              './simple-examples/data/ptb.valid.txt')

    def reader():
        archive_path = paddle.v2.dataset.common.download(
            paddle.v2.dataset.imikolov.URL, 'imikolov',
            paddle.v2.dataset.imikolov.MD5)
        with tarfile.open(archive_path) as tf:
            f = tf.extractfile(filename)
            unk_id = word_idx['<unk>']
            for line in f:
                tokens = ['<s>'] + line.strip().split() + ['<e>']
                # Lines with fewer than n tokens produce no window.
                if len(tokens) < n:
                    continue
                ids = [word_idx.get(token, unk_id) for token in tokens]
                for end in range(n, len(ids) + 1):
                    yield tuple(ids[end - n:end])

    return reader
 def train(n):
    """
    Return the reader over ./simple-examples/data/ptb.train.txt.

    :param n: number of word ids per sample, passed on to reader_creator.
    """
    return reader_creator(
        filename='./simple-examples/data/ptb.train.txt', n=n)
 def test(n):
    """
    Return the reader over ./simple-examples/data/ptb.valid.txt.

    :param n: number of word ids per sample, passed on to reader_creator.
    """
    return reader_creator(
        filename='./simple-examples/data/ptb.valid.txt', n=n)
--- a/python/paddle/v2/dataset/mnist.py
+++ b/python/paddle/v2/dataset/mnist.py
@ -9,9 +9,9 @@ __all__ = ['train', 'test']
 URL_PREFIX = 'http://yann.lecun.com/exdb/mnist/'
 TEST_IMAGE_URL = URL_PREFIX + 't10k-images-idx3-ubyte.gz'
-TEST_IMAGE_MD5 = '25e3cc63507ef6e98d5dc541e8672bb6'
+TEST_IMAGE_MD5 = '9fb629c4189551a2d022fa330f9573f3'
 TEST_LABEL_URL = URL_PREFIX + 't10k-labels-idx1-ubyte.gz'
-TEST_LABEL_MD5 = '4e9511fe019b2189026bd0421ba7b688'
+TEST_LABEL_MD5 = 'ec29112dd5afa0611ce80d1b7f02629c'
 TRAIN_IMAGE_URL = URL_PREFIX + 'train-images-idx3-ubyte.gz'
 TRAIN_IMAGE_MD5 = 'f68b3c2dcbeaaa9fbdd348bbdeb94873'
 TRAIN_LABEL_URL = URL_PREFIX + 'train-labels-idx1-ubyte.gz'
@ -35,6 +35,7 @@ def reader_creator(image_filename, label_filename, buffer_size):
        l = subprocess.Popen([zcat_cmd, label_filename], stdout=subprocess.PIPE)
        l.stdout.read(8)  # skip some magic bytes
        try:  # reader could be break.
            while True:
                labels = numpy.fromfile(
                    l.stdout, 'ubyte', count=buffer_size).astype("int")
@ -50,7 +51,7 @@ def reader_creator(image_filename, label_filename, buffer_size):
                for i in xrange(buffer_size):
                    yield images[i, :], int(labels[i])
-
+        finally:
            m.terminate()
            l.terminate()
--- a/python/paddle/v2/dataset/tests/imdb_test.py
+++ b/python/paddle/v2/dataset/tests/imdb_test.py
@ -0,0 +1,43 @@
 import paddle.v2.dataset.imdb
 import unittest
 import re
 TRAIN_POS_PATTERN = re.compile("aclImdb/train/pos/.*\.txt$")
 TRAIN_NEG_PATTERN = re.compile("aclImdb/train/neg/.*\.txt$")
 TRAIN_PATTERN = re.compile("aclImdb/train/.*\.txt$")
 TEST_POS_PATTERN = re.compile("aclImdb/test/pos/.*\.txt$")
 TEST_NEG_PATTERN = re.compile("aclImdb/test/neg/.*\.txt$")
 TEST_PATTERN = re.compile("aclImdb/test/.*\.txt$")
 class TestIMDB(unittest.TestCase):
    """Checks the size of the IMDB dictionary and both IMDB readers."""

    # Dictionary shared by all test methods. It is stored on the class:
    # unittest creates a new TestIMDB instance for every test method, so a
    # value assigned through ``self`` was rebuilt for each test.
    word_idx = None

    @classmethod
    def _shared_word_idx(cls):
        """Build the dictionary on first use and reuse it afterwards."""
        if TestIMDB.word_idx is None:
            TestIMDB.word_idx = paddle.v2.dataset.imdb.build_dict(
                TRAIN_PATTERN, 150)
        return TestIMDB.word_idx

    def test_build_dict(self):
        self.assertEqual(len(self._shared_word_idx()), 7036)

    def check_dataset(self, dataset, expected_size):
        """
        Iterate ``dataset(word_idx)`` and check the labels and the size.

        :param dataset: callable that takes the dictionary and returns an
            iterable of (word ids, label) samples.
        :param expected_size: number of samples the iterable must produce.
        """
        count = 0
        for sample in dataset(self._shared_word_idx()):
            # The labels are expected to alternate 0, 1, 0, 1, ...
            self.assertEqual(sample[1], count % 2)
            count += 1
        self.assertEqual(count, expected_size)

    def test_train(self):
        self.check_dataset(paddle.v2.dataset.imdb.train, 25000)

    def test_test(self):
        self.check_dataset(paddle.v2.dataset.imdb.test, 25000)
 if __name__ == '__main__':
    unittest.main()
--- a/python/paddle/v2/dataset/tests/imikolov_test.py
+++ b/python/paddle/v2/dataset/tests/imikolov_test.py
@ -0,0 +1,20 @@
 import paddle.v2.dataset.imikolov
 import unittest
 class TestMikolov(unittest.TestCase):
    """Checks that the imikolov readers yield samples of the asked size."""

    def check_reader(self, reader, n):
        """Assert that every sample of ``reader()`` has ``n`` items."""
        for sample in reader():
            self.assertEqual(len(sample), n)

    def test_train(self):
        window = 5
        train_reader = paddle.v2.dataset.imikolov.train(window)
        self.check_reader(train_reader, window)

    def test_test(self):
        window = 5
        test_reader = paddle.v2.dataset.imikolov.test(window)
        self.check_reader(test_reader, window)
 if __name__ == '__main__':
    unittest.main()
--- a/Show More
+++ b/Show More