From a3a6997e3213d9d895e83d6c83130a060a6aafa8 Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Wed, 8 Feb 2017 19:38:32 -0800 Subject: [PATCH 01/13] Add draft new_api.md --- doc/api/new_api.md | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 doc/api/new_api.md diff --git a/doc/api/new_api.md b/doc/api/new_api.md new file mode 100644 index 0000000000..5a90cd1c75 --- /dev/null +++ b/doc/api/new_api.md @@ -0,0 +1,38 @@ +import yi_json + +g = 100 +def read(): + queue q; + # warmup q + for i = 0 : 1000 + q.push(read()) + yield q.shuffle_get() + +input = paddle.layer.data(...) +intermediate = paddle.layers.fc(input) +output = paddle.layer.softmax(intermediate) + +model = paddle.model.create(output) + +train(model, data_provider=read, cluster="clusterId") + +#-------------------------------------------------------------------------------- + +# 1. package, docker build, docker push +# 2. kubectl, clusterId Kuberentes job, 10 trainer containers, 5 parameter server containers + +#-------------------------------------------------------------------------------- + +def train(): + if os.environ["kube_api_server"] == nil: + docker_build() + docker_push() + kube_ctrl_start_job() + else: + rank = kube_mpi_rank() + if rank == 0: + master() + elif rank >= 15: + parameter_server() + else: + _train() From 642e25e93be136ba012b52dfe44607cf92f0a0b6 Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Thu, 9 Feb 2017 13:14:13 -0800 Subject: [PATCH 02/13] Move new_api.md into design/api.md --- doc/api/{new_api.md => design/api.md} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename doc/api/{new_api.md => design/api.md} (100%) diff --git a/doc/api/new_api.md b/doc/api/design/api.md similarity index 100% rename from doc/api/new_api.md rename to doc/api/design/api.md From 36036c0ea57fdd0d9df8a4a77d8ebce89f65552b Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Thu, 9 Feb 2017 13:15:37 -0800 Subject: [PATCH 03/13] Correct directory structure --- doc/{api => }/design/api.md | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename doc/{api => }/design/api.md (100%) diff --git a/doc/api/design/api.md b/doc/design/api.md similarity index 100% rename from doc/api/design/api.md rename to doc/design/api.md From dd229dc7406de698601835d1d6e025d327bfe165 Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Thu, 9 Feb 2017 16:03:08 -0800 Subject: [PATCH 04/13] Update api.md --- doc/design/api.md | 128 ++++++++++++++++++++++++++++++++-------------- 1 file changed, 90 insertions(+), 38 deletions(-) diff --git a/doc/design/api.md b/doc/design/api.md index 5a90cd1c75..3cfb67cb00 100644 --- a/doc/design/api.md +++ b/doc/design/api.md @@ -1,38 +1,90 @@ -import yi_json - -g = 100 -def read(): - queue q; - # warmup q - for i = 0 : 1000 - q.push(read()) - yield q.shuffle_get() - -input = paddle.layer.data(...) -intermediate = paddle.layers.fc(input) -output = paddle.layer.softmax(intermediate) - -model = paddle.model.create(output) - -train(model, data_provider=read, cluster="clusterId") - -#-------------------------------------------------------------------------------- - -# 1. package, docker build, docker push -# 2. 
kubectl, clusterId Kuberentes job, 10 trainer containers, 5 parameter server containers - -#-------------------------------------------------------------------------------- - -def train(): - if os.environ["kube_api_server"] == nil: - docker_build() - docker_push() - kube_ctrl_start_job() - else: - rank = kube_mpi_rank() - if rank == 0: - master() - elif rank >= 15: - parameter_server() - else: - _train() +# Design Doc: PaddlePaddle API + +## Ingredients + +As the first step of our design, we list important concepts in deep +learning and try to figure their relationship, as shown below: + +``` +Model = {topology, parameters} + +Evaluator = {Model*, activations} +- forward +- test + +GradientMachine = {Model*, gradients} +- backward + +Optimizer = {Model*, Evaluator*, GradientMachine*} +- train +- update +- checkpoint +``` + +where the pair of curly braces `{` and `}` indicate *composition*, `*` +indicates a *reference*, and `-` marks a "class method". + + +### Model + +We used to think that parameters are part of the toplogy (or layers). +But that is not true, because multiple layers could share the same +parameter matrix. An example is a network that compares two text +segments in a semantic space: + +``` + semantic +text A -> projection ---\ + layer A \ + cosine + similarity -> output + layer + semantic / +text B -> projection ---/ + layer B +``` + +In this network, the two semantic projection layers (A and B) share +the same parameter matrix. + +For more information about our API that specifies topology and +parameter sharing, please refer to [TODO: API]. + + +### Evaluator + +Supposed that we have a trained ranking model, we should be able to +use it in our search engine. The search engine's Web server is a +concurrent program so to serve many HTTP requests simultaneously. It +doens't make sense for each of these threads to have its own copy of +model, because that would duplicate topologies and parameters. +However, each thread should be able to record layer outputs, i.e., +activations, computed from an input, derived from the request. With +*Evaluator* that saves activations, we can write the over-simplified +server program as: + +```python +m = paddle.model.load("trained.model") + +http.handle("/", + lambda req: + e = paddle.evaluator.create(m) + e.forward(req) + e.activation(layer="output")) # returns activations of layer "output" +``` + +### GradientMachine + +Similar to the evaluation, the training needs to compute gradients so +to update model parameters. Because an [optimizer](#optimizer) might +run multiple simultaneous threads to update the same model, gradients +should be separated from the model. Because gradients are only used +in training, but not serving, they should be separate from Evaluator. +Hence the `GradientMachine`. + +### Optimizer + +None of Model, Evaluator, nor GradientMachine implements the training +loop, hence Optimizer. We can define a concurrent optimizer that runs +multiple simultaneious threads to train a model -- just let each +thread has its own GradientMachine object. 
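To make the composition notation in the draft above concrete, the following is a minimal Python sketch of how the four concepts could relate to each other. It only illustrates the `{...}` (composition), `*` (reference), and `-` (method) notation as listed in this draft; every class name, field, and method body below is an assumption for illustration, not the actual PaddlePaddle API.

```python
# Illustrative sketch only; names and bodies are assumptions, not PaddlePaddle's API.

class Model(object):                       # Model = {topology, parameters}
    def __init__(self, topology, parameters):
        self.topology = topology
        self.parameters = parameters


class Evaluator(object):                   # Evaluator = {Model*, activations}
    def __init__(self, model):
        self.model = model                 # a shared reference, not a copy
        self.activations = {}              # per-instance layer outputs
    def forward(self, minibatch):
        pass                               # fill self.activations using self.model
    def test(self, minibatch):
        pass                               # forward plus cost/metric computation


class GradientMachine(object):             # GradientMachine = {Model*, gradients}
    def __init__(self, model):
        self.model = model
        self.gradients = {}                # kept separate from the shared parameters
    def backward(self):
        pass                               # fill self.gradients


class Optimizer(object):                   # Optimizer = {Model*, Evaluator*, GradientMachine*}
    def __init__(self, model):
        self.model = model
        self.evaluator = Evaluator(model)
        self.gradient_machine = GradientMachine(model)
    def train(self, reader):
        pass                               # training loop: forward, backward, update
    def update(self):
        pass                               # apply self.gradient_machine.gradients to parameters
    def checkpoint(self, path):
        pass                               # persist self.model.parameters
```

The only point of the sketch is to show what is shared (the single Model) and what is per-instance (activations in the Evaluator, gradients in the GradientMachine).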
From e4eacd5810d7253c3444b465c8284c88f1b0e1f6 Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Thu, 9 Feb 2017 18:14:38 -0800 Subject: [PATCH 05/13] Add Programming section --- doc/design/api.md | 201 ++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 194 insertions(+), 7 deletions(-) diff --git a/doc/design/api.md b/doc/design/api.md index 3cfb67cb00..d8825389cb 100644 --- a/doc/design/api.md +++ b/doc/design/api.md @@ -1,4 +1,4 @@ -# Design Doc: PaddlePaddle API +# PaddlePaddle API ## Ingredients @@ -27,8 +27,8 @@ indicates a *reference*, and `-` marks a "class method". ### Model -We used to think that parameters are part of the toplogy (or layers). -But that is not true, because multiple layers could share the same +We used to think that parameters are part of the topology (or layers). +But that is not true because multiple layers could share the same parameter matrix. An example is a network that compares two text segments in a semantic space: @@ -56,8 +56,7 @@ parameter sharing, please refer to [TODO: API]. ### Evaluator Supposed that we have a trained ranking model, we should be able to use it in our search engine. The search engine's Web server is a concurrent program so to serve many HTTP requests simultaneously. It -doens't make sense for each of these threads to have its own copy of -model, because that would duplicate topologies and parameters. +doesn't make sense for each of these threads to have its own copy of the model because that would duplicate topologies and parameters. However, each thread should be able to record layer outputs, i.e., activations, computed from an input, derived from the request. With *Evaluator* that saves activations, we can write the over-simplified server program as: @@ -70,7 +69,7 @@ http.handle("/", lambda req: e = paddle.evaluator.create(m) e.forward(req) - e.activation(layer="output")) # returns activations of layer "output" + e.activation(layer="output")) # returns activations of layer "output" ``` ### GradientMachine Similar to the evaluation, the training needs to compute gradients so to update model parameters. Because an [optimizer](#optimizer) might run multiple simultaneous threads to update the same model, gradients should be separated from the model. Because gradients are only used in training, but not serving, they should be separate from Evaluator. Hence the `GradientMachine`. ### Optimizer None of Model, Evaluator, nor GradientMachine implements the training loop, hence Optimizer. We can define a concurrent optimizer that runs -multiple simultaneious threads to train a model -- just let each +multiple simultaneous threads to train a model -- just let each thread has its own GradientMachine object. + +Most models should be able to be trained using the +`paddle.optimizer.SGD` by calling its `train` method. Many +customizations to the SGD algorithm happen with the update equation, +e.g., momentum and the Adam SGD algorithm. We make `train` calls +`update` to do an update, so that we can derive a `paddle.optimizer.Adam` +from `paddle.optimizer.SGD` by overrides only the `update` method. + + +## Programming + +A fictive example of PaddlePaddle program looks like the following: + +```python +import paddle + +def read(args): + f = open_file(args["filename"]) + mb = read_a_minibatch(f) + end_pass = eof(f) + if end_pass: + f = open_file(args["filename"]) # rewind for reading again + yield mb, end_pass + +input = paddle.layer.data(...) +intermediate = paddle.layer.fc(input) +output = paddle.layer.softmax(intermediate) + +model = paddle.model.create(output) + +paddle.train(model, data_provider=read) ``` + +This shows some important parts of a program: + +1. Define how to read (and augment) data by defining a function, in + this example, `read`, that `yields` a minibatch and a boolean flag + `end_pass`. + +1. Define the topology, `input`, `intermediate`, and `output` in this + example. + +1. 
Create parameters from the topology, thus forming the model, by calling + `paddle.model.create`. + +1. Train the model by calling `paddle.train`. + + +### Reader + +Not all programming frameworks allow users to define I/O functions. +An example is Google MapReduce, which can only read from text, +SSTable, and RecordIO files. Hadoop MapReduce allows users to define +readers and writers by deriving from base classes `Reader` and +`Writer`. The former is less flexible but also less error-prone. We +decide to provide the flexibility to users to define their readers. + + +#### A Synthetic Data Reader + +Sometimes we want to test a topology and/or a training algorithm using +synthetic data. We can do this by defining the reader as a synthesizer: + +```python +def read(args): + x = sample_from_uniform(0.0, 1.0) + y = sample_from_gauss(2 * x, sigma) + yield {x, y}, False # no end-of-file so no end-of-pass +``` + +#### A Reader for Online Learning + +Readers can also read an infinite data stream, e.g., a log stream from +a search engine collected by Kafka: + +```python +def read(args): + log_stream = kafka.open_channel(args["kafka channel name"]) + yield log_stream.read(), False # no end-of-pass in online learning +``` + +### Topology + +By default, layers don't have names. But if we want to refer to a +layer at some later time, for example, when we do serving using the model +and want the activations/outputs of a layer, we should give it a name. + +```python +input = paddle.layer.data(...) +intermediate = paddle.layer.fc(input, name="inter", ...) +output = paddle.layer.softmax(intermediate, name="output", ...) + +m = paddle.model.create(output) +e = paddle.evaluator.create(m) +e.forward(read_an_input()) # compute activations of all layers. +print e.activations(layer="inter") # retrieve the activations of layer "inter" +print e.activations(layer="output") # retrieve the activations of layer "output" +``` + +#### Sharing Parameters + +In the [above section](#model) we show a network whose two layers share +the same parameter matrix. To specify such cases, we give "parameter +names" to layers. If some layers have the same parameter names, +`paddle.model.create` creates a single parameter matrix for these +layers: + +```python +text1 = paddle.layer.data(...) +semantic1 = paddle.layer.fc(text1, ..., parameter_name="semantic_projection") +text2 = paddle.layer.data(...) +semantic2 = paddle.layer.fc(text2, ..., parameter_name="semantic_projection") +out = paddle.layer.cosine(semantic1, semantic2) +``` + +We can also share parameter matrices between layers in different +models. To do this, we need an additional parameter that refers to a +model: + +```python +model1_input = paddle.layer.data(...) +model1_output = paddle.layer.softmax(model1_input, ..., + parameter_name="a_parameter_matrix") +model1 = paddle.model.create(model1_output) + +# Another model +model2_semantic = paddle.layer.fc(text2, ..., + parameter_name="a_parameter_matrix", + parameter_model=model1) +``` + +### Training + +The recommended way to train a model is to call `paddle.train`, +which simply calls `paddle.optimizer.Default`, a global variable of +type `paddle.optimizer.SGD`. Equivalently, we can do + +```python +opt = paddle.optimizer.SGD(...) +opt.train(model, reader=read, ...) +``` + +#### Distributed Training + +If users want to do distributed training on a cluster, they should +call `paddle.dist_train` and provide access tokens to the cluster as +a parameter. 
+ +For example, if the user has a TLS certificate that allows them to access a Kubernetes cluster, they should be able to call + +```python +paddle.dist_train(model, + reader=read, + optimizer=paddle.optimizer.SGDOptimizer(...), + k8s_user="yi", + k8s_token="kube_cluster_tls.pem", + k8s_job="hello", + num_parameter_servers=15) +``` + +The pseudo code of `paddle.dist_train` is as follows: + +```python +def dist_train(): + if os.getenv("KUBERNETES_SERVICE_HOST") == None: + image_name = k8s_user + '/' + k8s_job + docker_build(image_name) + docker_push() + kube_ctrl_start_job(image_name, k8s_user, k8s_token) + else: + rank = kube_list_containers_in_job_and_return_current_containers_rank() + if rank == 0: + master() + elif rank < 15: + parameter_server() + else: + optimizer.train(model, reader=read) +``` + +Please be aware that if a process is running on the Kubernetes + cluster, it will have some environment variables pre-defined. + +If `dist_train` doesn't see these environment variables, it knows +that it's running on a user's personal computer, and it should work as a +*launcher*. Otherwise, it knows that it's running on the cluster and +needs to figure out its role as either the master, or a trainer, or a +parameter server. From b17bc8a88ff5191ff75985bf36a149df796b508d Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Thu, 9 Feb 2017 18:39:22 -0800 Subject: [PATCH 06/13] GradientMachien refers to Evaluator but not Model --- doc/design/api.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/design/api.md b/doc/design/api.md index d8825389cb..fe37cf091c 100644 --- a/doc/design/api.md +++ b/doc/design/api.md @@ -12,7 +12,7 @@ Evaluator = {Model*, activations} - forward - test -GradientMachine = {Model*, gradients} +GradientMachine = {Evaluator*, gradients} - backward Optimizer = {Model*, Evaluator*, GradientMachine*} From 08c0051f95bdb2e3334d228fef8e1bc465e8ce5f Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Thu, 9 Feb 2017 18:52:53 -0800 Subject: [PATCH 07/13] Fix problem pointed out by Long-Fei --- doc/design/api.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/design/api.md b/doc/design/api.md index fe37cf091c..17ab50fdb2 100644 --- a/doc/design/api.md +++ b/doc/design/api.md @@ -15,7 +15,7 @@ Evaluator = {Model*, activations} GradientMachine = {Evaluator*, gradients} - backward -Optimizer = {Model*, Evaluator*, GradientMachine*} +Optimizer = {Model*, GradientMachine*} - train - update - checkpoint From e60fe1df22fa9cd37003c258474b2aa1b8cde500 Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Thu, 9 Feb 2017 18:56:51 -0800 Subject: [PATCH 08/13] Cost as a parameter to train and test --- doc/design/api.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/design/api.md b/doc/design/api.md index 17ab50fdb2..dcf19451b1 100644 --- a/doc/design/api.md +++ b/doc/design/api.md @@ -10,13 +10,13 @@ Model = {topology, parameters} Evaluator = {Model*, activations} - forward -- test +- test(cost, ...) GradientMachine = {Evaluator*, gradients} - backward Optimizer = {Model*, GradientMachine*} -- train +- train(cost, ...) 
- update - checkpoint ``` From 6de262c357ef9274da304ffa804df935db103bc4 Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Thu, 9 Feb 2017 19:51:28 -0800 Subject: [PATCH 09/13] Correct minor problems --- doc/design/api.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/design/api.md b/doc/design/api.md index dcf19451b1..dd4341b324 100644 --- a/doc/design/api.md +++ b/doc/design/api.md @@ -1,4 +1,4 @@ -# PaddlePaddle API +# PaddlePaddle Design Doc ## Ingredients @@ -15,7 +15,7 @@ Evaluator = {Model*, activations} GradientMachine = {Evaluator*, gradients} - backward -Optimizer = {Model*, GradientMachine*} +Optimizer = {GradientMachine*} - train(cost, ...) - update - checkpoint @@ -96,7 +96,7 @@ e.g., momentum and the Adam SGD algorithm. We make `train` calls from `paddle.optimizer.SGD` by overrides only the `update` method. -## Programming +## Programming Interface A fictive example of PaddlePaddle program looks like the following: From 970440622fd985afeaed76ea812e4779bb0bf901 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Fri, 10 Feb 2017 17:08:39 +0800 Subject: [PATCH 10/13] Temporary disable async load data in PyDP2. * It seems some other change breaks the async load. It will cause data error in GPU mode. --- paddle/gserver/dataproviders/PyDataProvider2.cpp | 2 +- python/paddle/trainer_config_helpers/data_sources.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/gserver/dataproviders/PyDataProvider2.cpp b/paddle/gserver/dataproviders/PyDataProvider2.cpp index c26e242534..b8079dc079 100644 --- a/paddle/gserver/dataproviders/PyDataProvider2.cpp +++ b/paddle/gserver/dataproviders/PyDataProvider2.cpp @@ -647,7 +647,7 @@ public: DataBatch& gpuBatch = *batch; std::vector& gpuArguments = gpuBatch.getStreams(); gpuArguments.resize(cpuArguments.size()); - gpuBatch.setSize(size); + gpuBatch.setSize(bsize); for (size_t i = 0; i < headers_.size(); ++i) { gpuArguments[i].resizeAndCopyFrom( cpuArguments[i], useGpu_, HPPL_STREAM_1); diff --git a/python/paddle/trainer_config_helpers/data_sources.py b/python/paddle/trainer_config_helpers/data_sources.py index 0ea8fc77ee..ab9a2562dc 100644 --- a/python/paddle/trainer_config_helpers/data_sources.py +++ b/python/paddle/trainer_config_helpers/data_sources.py @@ -201,7 +201,7 @@ def define_py_data_sources2(train_list, test_list, module, obj, args=None): data.load_data_module = load_data_module data.load_data_object = load_data_object data.load_data_args = load_data_args - data.async_load_data = True + data.async_load_data = False return data define_py_data_sources( From ff6f7827dd6e067ff759d84c4a7519cf09522d02 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Fri, 10 Feb 2017 22:17:37 +0800 Subject: [PATCH 11/13] Fix protobuf version in travis --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 162bebba09..f9ad1c9203 100644 --- a/.travis.yml +++ b/.travis.yml @@ -54,7 +54,7 @@ before_install: fi - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then paddle/scripts/travis/before_install.osx.sh; fi - if [[ "$JOB" == "PRE_COMMIT" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi - - pip install numpy wheel protobuf sphinx recommonmark sphinx_rtd_theme virtualenv pre-commit requests==2.9.2 LinkChecker + - pip install numpy wheel 'protobuf==3.1' sphinx recommonmark sphinx_rtd_theme virtualenv pre-commit requests==2.9.2 LinkChecker script: - paddle/scripts/travis/main.sh notifications: From 71c3c93c72409118b928b01478af95081a00114a Mon Sep 17 00:00:00 
2001 From: Yu Yang Date: Fri, 10 Feb 2017 22:39:46 +0800 Subject: [PATCH 12/13] Fix unittest --- .../tests/configs/protostr/test_split_datasource.protostr | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_split_datasource.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_split_datasource.protostr index 1cfb92255a..569b0b945a 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_split_datasource.protostr +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_split_datasource.protostr @@ -19,7 +19,7 @@ model_config { data_config { type: "py2" files: "train.list" - async_load_data: true + async_load_data: false for_test: false load_data_module: "a" load_data_object: "c" @@ -58,7 +58,7 @@ opt_config { test_data_config { type: "py2" files: "test.list" - async_load_data: true + async_load_data: false for_test: true load_data_module: "b" load_data_object: "d" From 67855d3b4edec4780a16e9583ab183753acf1581 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Sun, 12 Feb 2017 16:07:27 +0800 Subject: [PATCH 13/13] Add comments --- .travis.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.travis.yml b/.travis.yml index f9ad1c9203..5d82d9729b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -54,6 +54,8 @@ before_install: fi - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then paddle/scripts/travis/before_install.osx.sh; fi - if [[ "$JOB" == "PRE_COMMIT" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi + # Paddle is using protobuf 3.1 currently. Protobuf 3.2 breaks the compatibility. So we specify the python + # protobuf version. - pip install numpy wheel 'protobuf==3.1' sphinx recommonmark sphinx_rtd_theme virtualenv pre-commit requests==2.9.2 LinkChecker script: - paddle/scripts/travis/main.sh
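Returning to the optimizer design in the api.md patches above: the draft says that `train` calls `update`, so that a `paddle.optimizer.Adam` could be derived from `paddle.optimizer.SGD` by overriding only the `update` method. Below is a minimal, self-contained sketch of that idea. The class layout, the dict-of-arrays representation of parameters and gradients, and the hyper-parameter defaults are assumptions for illustration, not the actual PaddlePaddle API.

```python
# Sketch of deriving an Adam-like optimizer from SGD by overriding only `update`.
# All names and signatures here are illustrative assumptions.
import numpy as np


class SGD(object):
    def __init__(self, learning_rate=0.01):
        self.lr = learning_rate

    def train(self, parameters, gradient_fn, num_steps=100):
        # The generic loop calls `update`; subclasses change only the update rule.
        for _ in range(num_steps):
            gradients = gradient_fn(parameters)
            self.update(parameters, gradients)

    def update(self, parameters, gradients):
        for name, grad in gradients.items():
            parameters[name] -= self.lr * grad


class Adam(SGD):
    def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, eps=1e-8):
        super(Adam, self).__init__(learning_rate)
        self.beta1, self.beta2, self.eps = beta1, beta2, eps
        self.m, self.v, self.t = {}, {}, 0

    def update(self, parameters, gradients):
        # Adam keeps running moment estimates; the inherited `train` loop is unchanged.
        self.t += 1
        for name, grad in gradients.items():
            self.m[name] = self.beta1 * self.m.get(name, 0.0) + (1 - self.beta1) * grad
            self.v[name] = self.beta2 * self.v.get(name, 0.0) + (1 - self.beta2) * grad ** 2
            m_hat = self.m[name] / (1 - self.beta1 ** self.t)
            v_hat = self.v[name] / (1 - self.beta2 ** self.t)
            parameters[name] -= self.lr * m_hat / (np.sqrt(v_hat) + self.eps)
```

The design point is only that customizing the update equation does not require touching the training loop.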