From a3a6997e3213d9d895e83d6c83130a060a6aafa8 Mon Sep 17 00:00:00 2001
From: Yi Wang
Date: Wed, 8 Feb 2017 19:38:32 -0800
Subject: [PATCH 1/9] Add draft new_api.md

---
 doc/api/new_api.md | 38 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)
 create mode 100644 doc/api/new_api.md

diff --git a/doc/api/new_api.md b/doc/api/new_api.md
new file mode 100644
index 0000000000..5a90cd1c75
--- /dev/null
+++ b/doc/api/new_api.md
@@ -0,0 +1,38 @@
+import yi_json
+
+g = 100
+def read():
+    queue q;
+    # warmup q
+    for i = 0 : 1000
+        q.push(read())
+    yield q.shuffle_get()
+
+input = paddle.layer.data(...)
+intermediate = paddle.layers.fc(input)
+output = paddle.layer.softmax(intermediate)
+
+model = paddle.model.create(output)
+
+train(model, data_provider=read, cluster="clusterId")
+
+#--------------------------------------------------------------------------------
+
+# 1. package, docker build, docker push
+# 2. kubectl, clusterId Kubernetes job, 10 trainer containers, 5 parameter server containers
+
+#--------------------------------------------------------------------------------
+
+def train():
+    if os.environ["kube_api_server"] == nil:
+        docker_build()
+        docker_push()
+        kube_ctrl_start_job()
+    else:
+        rank = kube_mpi_rank()
+        if rank == 0:
+            master()
+        elif rank >= 15:
+            parameter_server()
+        else:
+            _train()

From 642e25e93be136ba012b52dfe44607cf92f0a0b6 Mon Sep 17 00:00:00 2001
From: Yi Wang
Date: Thu, 9 Feb 2017 13:14:13 -0800
Subject: [PATCH 2/9] Move new_api.md into design/api.md

---
 doc/api/{new_api.md => design/api.md} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename doc/api/{new_api.md => design/api.md} (100%)

diff --git a/doc/api/new_api.md b/doc/api/design/api.md
similarity index 100%
rename from doc/api/new_api.md
rename to doc/api/design/api.md

From 36036c0ea57fdd0d9df8a4a77d8ebce89f65552b Mon Sep 17 00:00:00 2001
From: Yi Wang
Date: Thu, 9 Feb 2017 13:15:37 -0800
Subject: [PATCH 3/9] Correct directory structure

---
 doc/{api => }/design/api.md | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename doc/{api => }/design/api.md (100%)

diff --git a/doc/api/design/api.md b/doc/design/api.md
similarity index 100%
rename from doc/api/design/api.md
rename to doc/design/api.md

From dd229dc7406de698601835d1d6e025d327bfe165 Mon Sep 17 00:00:00 2001
From: Yi Wang
Date: Thu, 9 Feb 2017 16:03:08 -0800
Subject: [PATCH 4/9] Update api.md

---
 doc/design/api.md | 128 ++++++++++++++++++++++++++++++++--------------
 1 file changed, 90 insertions(+), 38 deletions(-)

diff --git a/doc/design/api.md b/doc/design/api.md
index 5a90cd1c75..3cfb67cb00 100644
--- a/doc/design/api.md
+++ b/doc/design/api.md
@@ -1,38 +1,90 @@
-import yi_json
-
-g = 100
-def read():
-    queue q;
-    # warmup q
-    for i = 0 : 1000
-        q.push(read())
-    yield q.shuffle_get()
-
-input = paddle.layer.data(...)
-intermediate = paddle.layers.fc(input)
-output = paddle.layer.softmax(intermediate)
-
-model = paddle.model.create(output)
-
-train(model, data_provider=read, cluster="clusterId")
-
-#--------------------------------------------------------------------------------
-
-# 1. package, docker build, docker push
-# 2. kubectl, clusterId Kubernetes job, 10 trainer containers, 5 parameter server containers
-
-#--------------------------------------------------------------------------------
-
-def train():
-    if os.environ["kube_api_server"] == nil:
-        docker_build()
-        docker_push()
-        kube_ctrl_start_job()
-    else:
-        rank = kube_mpi_rank()
-        if rank == 0:
-            master()
-        elif rank >= 15:
-            parameter_server()
-        else:
-            _train()
+# Design Doc: PaddlePaddle API
+
+## Ingredients
+
+As the first step of our design, we list important concepts in deep
+learning and try to figure out their relationship, as shown below:
+
+```
+Model = {topology, parameters}
+
+Evaluator = {Model*, activations}
+- forward
+- test
+
+GradientMachine = {Model*, gradients}
+- backward
+
+Optimizer = {Model*, Evaluator*, GradientMachine*}
+- train
+- update
+- checkpoint
+```
+
+where the pair of curly braces `{` and `}` indicates *composition*, `*`
+indicates a *reference*, and `-` marks a "class method".
+
+
+### Model
+
+We used to think that parameters are part of the toplogy (or layers).
+But that is not true, because multiple layers could share the same
+parameter matrix. An example is a network that compares two text
+segments in a semantic space:
+
+```
+          semantic
+text A -> projection ---\
+          layer A        \
+                          cosine
+                          similarity -> output
+                          layer
+          semantic       /
+text B -> projection ---/
+          layer B
+```
+
+In this network, the two semantic projection layers (A and B) share
+the same parameter matrix.
+
+For more information about our API that specifies topology and
+parameter sharing, please refer to [TODO: API].
+
+
+### Evaluator
+
+Suppose that we have a trained ranking model; we should be able to
+use it in our search engine. The search engine's Web server is a
+concurrent program that serves many HTTP requests simultaneously. It
+doens't make sense for each of these threads to have its own copy of
+model, because that would duplicate topologies and parameters.
+However, each thread should be able to record layer outputs, i.e.,
+activations, computed from an input, derived from the request. With
+*Evaluator* that saves activations, we can write the over-simplified
+server program as:
+
+```python
+m = paddle.model.load("trained.model")
+
+http.handle("/",
+            lambda req:
+                e = paddle.evaluator.create(m)
+                e.forward(req)
+                e.activation(layer="output")) # returns activations of layer "output"
+```
+
+### GradientMachine
+
+Similar to the evaluation, the training needs to compute gradients so
+as to update model parameters. Because an [optimizer](#optimizer) might
+run multiple simultaneous threads to update the same model, gradients
+should be separated from the model. And because gradients are only used
+in training, not in serving, they should be separate from the Evaluator.
+Hence the `GradientMachine`.
+
+### Optimizer
+
+None of Model, Evaluator, nor GradientMachine implements the training
+loop, hence Optimizer. We can define a concurrent optimizer that runs
+multiple simultaneious threads to train a model -- just let each
+thread have its own GradientMachine object.
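The Evaluator section above argues for one shared model plus one Evaluator per serving thread. Below is a minimal sketch of that arrangement, assuming only the `paddle.model.load`, `paddle.evaluator.create`, `forward`, and `activation` calls quoted in the design; the thread pool, the request queue, and the `reply` helper are hypothetical scaffolding added here for illustration, not part of the proposed API.

```python
# Illustrative sketch only: one shared model, one Evaluator per worker thread.
import threading
import Queue  # Python 2 standard library, matching the code style in the design

import paddle  # the package proposed by this design

requests = Queue.Queue()  # filled by the Web server's accept loop (not shown)

m = paddle.model.load("trained.model")  # topology + parameters, loaded once

def worker():
    # Each thread owns its own Evaluator, so its activations never clash
    # with those of other threads, while the model (topology and
    # parameters) is shared by reference and never copied.
    e = paddle.evaluator.create(m)
    while True:
        req = requests.get()
        e.forward(req)
        reply(req, e.activation(layer="output"))  # reply() is a placeholder

for _ in range(8):  # eight workers, one model
    threading.Thread(target=worker).start()
```

Because every Evaluator only holds a reference to `m`, the topology and parameters exist once in memory no matter how many worker threads are started.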
From e4eacd5810d7253c3444b465c8284c88f1b0e1f6 Mon Sep 17 00:00:00 2001
From: Yi Wang
Date: Thu, 9 Feb 2017 18:14:38 -0800
Subject: [PATCH 5/9] Add Programming section

---
 doc/design/api.md | 201 ++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 194 insertions(+), 7 deletions(-)

diff --git a/doc/design/api.md b/doc/design/api.md
index 3cfb67cb00..d8825389cb 100644
--- a/doc/design/api.md
+++ b/doc/design/api.md
@@ -1,4 +1,4 @@
-# Design Doc: PaddlePaddle API
+# PaddlePaddle API
 
 ## Ingredients
 
@@ -27,8 +27,8 @@ indicates a *reference*, and `-` marks a "class method".
 
 ### Model
 
-We used to think that parameters are part of the toplogy (or layers).
-But that is not true, because multiple layers could share the same
+We used to think that parameters are part of the topology (or layers).
+But that is not true because multiple layers could share the same
 parameter matrix. An example is a network that compares two text
 segments in a semantic space:
 
@@ -56,8 +56,7 @@ parameter sharing, please refer to [TODO: API].
 Suppose that we have a trained ranking model; we should be able to
 use it in our search engine. The search engine's Web server is a
 concurrent program that serves many HTTP requests simultaneously. It
-doens't make sense for each of these threads to have its own copy of
-model, because that would duplicate topologies and parameters.
+doesn't make sense for each of these threads to have its own copy of the model because that would duplicate topologies and parameters.
 However, each thread should be able to record layer outputs, i.e.,
 activations, computed from an input, derived from the request. With
 *Evaluator* that saves activations, we can write the over-simplified
@@ -70,7 +69,7 @@ http.handle("/",
             lambda req:
                 e = paddle.evaluator.create(m)
                 e.forward(req)
-                e.activation(layer="output")) # returns activations of layer "output"
+                e.activation(layer="output"))  # returns activations of layer "output"
 ```
 
 ### GradientMachine
@@ -86,5 +85,193 @@ Hence the `GradientMachine`.
 None of Model, Evaluator, nor GradientMachine implements the training
 loop, hence Optimizer. We can define a concurrent optimizer that runs
-multiple simultaneious threads to train a model -- just let each
+multiple simultaneous threads to train a model -- just let each
 thread have its own GradientMachine object.
+
+Most models should be able to be trained using `paddle.optimizer.SGD`
+by calling its `train` method. Many
+customizations to the SGD algorithm happen in the update equation,
+e.g., momentum and the Adam SGD algorithm. We make `train` call
+`update` to do an update, so that we can derive a `paddle.optimizer.Adam`
+from `paddle.optimizer.SGD` by overriding only the `update` method.
+
+
+## Programming
+
+An illustrative example of a PaddlePaddle program looks like the following:
+
+```python
+import paddle
+
+def read(args):
+    f = open_file(args["filename"])
+    mb = read_a_minibatch(f)
+    end_pass = eof(f)
+    if end_pass:
+        f = open_file(args["filename"])  # rewind for reading again
+    yield mb, end_pass
+
+input = paddle.layer.data(...)
+intermediate = paddle.layer.fc(input)
+output = paddle.layer.softmax(intermediate)
+
+model = paddle.model.create(output)
+
+paddle.train(model, data_provider=read)
+```
+
+This shows some important parts of a program:
+
+1. Define how to read (and augment) data by defining a function, in
+   this example `read`, that yields a minibatch and a boolean flag
+   `end_pass`.
+
+1. Define the topology, `input`, `intermediate`, and `output` in this
+   example.
+
+1. Create the parameters, and thus the model, from the topology by
+   calling `paddle.model.create`.
+
+1. Train the model by calling `paddle.train`.
+
+
+### Reader
+
+Not all programming frameworks allow users to define I/O functions.
+An example is Google MapReduce, which can only read from text,
+SSTable, and RecordIO files. Hadoop MapReduce allows users to define
+readers and writers by deriving from base classes `Reader` and
+`Writer`. The former is less flexible but also less error-prone. We
+choose to give users the flexibility to define their own readers.
+
+
+#### A Synthetic Data Reader
+
+Sometimes we want to test a topology and/or a training algorithm using
+synthetic data. We can do this by making the reader a synthesizer:
+
+```python
+def read(args):
+    x = sample_from_uniform(0.0, 1.0)
+    y = sample_from_gauss(2 * x, sigma)
+    yield (x, y), False  # no end-of-file, so no end-of-pass
+```
+
+#### A Reader for Online Learning
+
+Readers can also read an infinite data stream, e.g., a log stream from
+a search engine, collected by Kafka:
+
+```python
+def read(args):
+    log_stream = kafka.open_channel(args["kafka channel name"])
+    yield log_stream.read(), False  # no end-of-pass in online learning
+```
+
+### Topology
+
+By default, layers don't have names. But if we want to refer to a
+layer later, for example, when we serve using the model and want the
+activations/outputs of a layer, we should give it a name.
+
+```python
+input = paddle.layer.data(...)
+intermediate = paddle.layer.fc(input, name="inter", ...)
+output = paddle.layer.softmax(intermediate, name="output", ...)
+
+m = paddle.model.create(output)
+e = paddle.evaluator.create(m)
+e.forward(read_an_input())           # compute activations of all layers.
+print e.activations(layer="inter")   # retrieve the activations of layer "inter"
+print e.activations(layer="output")  # retrieve the activations of layer "output"
+```
+
+#### Sharing Parameters
+
+In the [section above](#model) we showed a network whose two layers share
+the same parameter matrix. To specify such cases, we give "parameter
+names" to layers. If some layers have the same parameter names,
+`paddle.model.create` creates a single parameter matrix for these
+layers:
+
+```python
+text1 = paddle.layer.data(...)
+semantic1 = paddle.layer.fc(text1, ..., parameter_name="semantic_projection")
+text2 = paddle.layer.data(...)
+semantic2 = paddle.layer.fc(text2, ..., parameter_name="semantic_projection")
+out = paddle.layer.cosine(semantic1, semantic2)
+```
+
+We can also share parameter matrices between layers in different
+models. To do this, we need an additional parameter that refers to a
+model:
+
+```python
+model1_input = paddle.layer.data(...)
+model1_output = paddle.layer.softmax(model1_input, ...,
+                                     parameter_name="a_parameter_matrix")
+model1 = paddle.model.create(model1_output)
+
+# Another model
+model2_semantic = paddle.layer.fc(text2, ...,
+                                  parameter_name="a_parameter_matrix",
+                                  parameter_model=model1)
+```
+
+### Training
+
+The recommended way to train a model is to call `paddle.train`,
+which simply calls `paddle.optimizer.Default`, a global variable of
+type `paddle.optimizer.SGD`. Equivalently, we can do
+
+```python
+opt = paddle.optimizer.SGD(...)
+opt.train(model, reader=read, ...)
+```
+
+#### Distributed Training
+
+If a user wants to do distributed training on a cluster, s/he should
+call `paddle.dist_train` and provide access tokens to the cluster as
+a parameter.
+
+For example, if the user has a TLS certificate that allows him to
+access a Kubernetes cluster, s/he should be able to call
+
+```python
+paddle.dist_train(model,
+                  reader=read,
+                  optimizer=paddle.optimizer.SGD(...),
+                  k8s_user="yi",
+                  k8s_token="kube_cluster_tls.pem",
+                  k8s_job="hello",
+                  num_parameter_servers=15)
+```
+
+The pseudo code of `paddle.dist_train` is as follows:
+
+```python
+def dist_train():
+    if os.getenv("KUBERNETES_SERVICE_HOST") == None:
+        image_name = k8s_user + '/' + k8s_job
+        docker_build(image_name)
+        docker_push()
+        kube_ctrl_start_job(image_name, k8s_user, k8s_token)
+    else:
+        rank = kube_list_containers_in_job_and_return_current_containers_rank()
+        if rank == 0:
+            master()
+        elif rank <= 15:  # ranks 1-15 are the 15 parameter servers
+            parameter_server()
+        else:
+            optimizer.train(model, reader=read)
+```
+
+Please be aware that if a process is running on the Kubernetes
+cluster, it will have some environment variables pre-defined.
+
+If `dist_train` doesn't see these environment variables, it knows
+that it's running on the user's personal computer, and it should work as a
+*launcher*. Otherwise, it knows that it's running on the cluster and
+needs to figure out its role as either the master, a trainer, or a
+parameter server.

From b17bc8a88ff5191ff75985bf36a149df796b508d Mon Sep 17 00:00:00 2001
From: Yi Wang
Date: Thu, 9 Feb 2017 18:39:22 -0800
Subject: [PATCH 6/9] GradientMachine refers to Evaluator but not Model

---
 doc/design/api.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/design/api.md b/doc/design/api.md
index d8825389cb..fe37cf091c 100644
--- a/doc/design/api.md
+++ b/doc/design/api.md
@@ -12,7 +12,7 @@ Evaluator = {Model*, activations}
 - forward
 - test
 
-GradientMachine = {Model*, gradients}
+GradientMachine = {Evaluator*, gradients}
 - backward
 
 Optimizer = {Model*, Evaluator*, GradientMachine*}

From 08c0051f95bdb2e3334d228fef8e1bc465e8ce5f Mon Sep 17 00:00:00 2001
From: Yi Wang
Date: Thu, 9 Feb 2017 18:52:53 -0800
Subject: [PATCH 7/9] Fix problem pointed out by Long-Fei

---
 doc/design/api.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/design/api.md b/doc/design/api.md
index fe37cf091c..17ab50fdb2 100644
--- a/doc/design/api.md
+++ b/doc/design/api.md
@@ -15,7 +15,7 @@ Evaluator = {Model*, activations}
 GradientMachine = {Evaluator*, gradients}
 - backward
 
-Optimizer = {Model*, Evaluator*, GradientMachine*}
+Optimizer = {Model*, GradientMachine*}
 - train
 - update
 - checkpoint

From e60fe1df22fa9cd37003c258474b2aa1b8cde500 Mon Sep 17 00:00:00 2001
From: Yi Wang
Date: Thu, 9 Feb 2017 18:56:51 -0800
Subject: [PATCH 8/9] Cost as a parameter to train and test

---
 doc/design/api.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/doc/design/api.md b/doc/design/api.md
index 17ab50fdb2..dcf19451b1 100644
--- a/doc/design/api.md
+++ b/doc/design/api.md
@@ -10,13 +10,13 @@ Model = {topology, parameters}
 
 Evaluator = {Model*, activations}
 - forward
-- test
+- test(cost, ...)
 
 GradientMachine = {Evaluator*, gradients}
 - backward
 
 Optimizer = {Model*, GradientMachine*}
-- train
+- train(cost, ...)
 - update
 - checkpoint
 ```

From 6de262c357ef9274da304ffa804df935db103bc4 Mon Sep 17 00:00:00 2001
From: Yi Wang
Date: Thu, 9 Feb 2017 19:51:28 -0800
Subject: [PATCH 9/9] Correct minor problems

---
 doc/design/api.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/doc/design/api.md b/doc/design/api.md
index dcf19451b1..dd4341b324 100644
--- a/doc/design/api.md
+++ b/doc/design/api.md
@@ -1,4 +1,4 @@
-# PaddlePaddle API
+# PaddlePaddle Design Doc
 
 ## Ingredients
 
@@ -15,7 +15,7 @@ Evaluator = {Model*, activations}
 GradientMachine = {Evaluator*, gradients}
 - backward
 
-Optimizer = {Model*, GradientMachine*}
+Optimizer = {GradientMachine*}
 - train(cost, ...)
 - update
 - checkpoint
@@ -96,7 +96,7 @@ e.g., momentum and the Adam SGD algorithm. We make `train` call
 from `paddle.optimizer.SGD` by overriding only the `update` method.
 
 
-## Programming
+## Programming Interface
 
 An illustrative example of a PaddlePaddle program looks like the following:
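The Optimizer section claims that `paddle.optimizer.Adam` can be derived from `paddle.optimizer.SGD` by overriding only the `update` method. Here is a minimal sketch of that relationship: the class names and the composition (Optimizer holding a GradientMachine, which holds an Evaluator, which holds the Model) follow the design above, but every method body, the reader protocol details, and the parameter and gradient dictionaries are assumptions made for illustration, not the actual implementation.

```python
# Minimal sketch of the SGD/Adam relationship described in the Optimizer
# section. The training loop lives in SGD.train(); Adam overrides only
# the update rule. All bodies are illustrative placeholders.

class SGD(object):
    def __init__(self, learning_rate=0.01):
        self.learning_rate = learning_rate

    def update(self, name, parameter, gradient):
        # plain SGD update rule
        return parameter - self.learning_rate * gradient

    def train(self, cost, gradient_machine, reader):
        # The loop is written once, here; derived optimizers customize
        # behavior by overriding update() only.
        for minibatch, end_pass in reader():
            gradient_machine.evaluator.forward(minibatch)
            gradient_machine.backward(cost)
            params = gradient_machine.evaluator.model.parameters
            for name, grad in gradient_machine.gradients.items():
                params[name] = self.update(name, params[name], grad)


class Adam(SGD):
    def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, eps=1e-8):
        super(Adam, self).__init__(learning_rate)
        self.beta1, self.beta2, self.eps = beta1, beta2, eps
        self.moments = {}  # per-parameter first/second moments and step count

    def update(self, name, parameter, gradient):
        # Adam update rule; train() is inherited from SGD unchanged.
        m, v, t = self.moments.get(name, (0.0, 0.0, 0))
        t += 1
        m = self.beta1 * m + (1 - self.beta1) * gradient
        v = self.beta2 * v + (1 - self.beta2) * gradient * gradient
        self.moments[name] = (m, v, t)
        m_hat = m / (1 - self.beta1 ** t)
        v_hat = v / (1 - self.beta2 ** t)
        return parameter - self.learning_rate * m_hat / (v_hat ** 0.5 + self.eps)
```

A concurrent optimizer, as described earlier, could run this same loop in several threads, each thread with its own GradientMachine object.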