commit
64800cfebc
@ -1,10 +1,22 @@
|
||||
Distributed Training
|
||||
====================
|
||||
|
||||
In this section, we'll explain how to run distributed training jobs with PaddlePaddle on different types of clusters. The diagram below shows the main architecture of a distributed training job:
|
||||
|
||||
.. image:: src/ps_en.png
|
||||
:width: 500
|
||||
|
||||
- Data shard: training data will be split into multiple partitions, trainers use the partitions of the whole dataset to do the training job.
|
||||
- Trainer: each trainer reads the data shard, and trains the neural network. Then the trainer will upload calculated "gradients" to parameter servers, and wait for parameters to be optimized on the parameter server side. When that finishes, the trainer downloads optimized parameters and continues its training.
|
||||
- Parameter server: every parameter server stores part of the whole neural network model data. They will do optimization calculations when gradients are uploaded from trainers, and then send updated parameters to trainers.
|
||||
|
||||
PaddlePaddle can support both synchronous stochastic gradient descent (SGD) and asynchronous SGD.
|
||||
|
||||
When training with synchronous SGD, PaddlePaddle uses an internal "synchronization barrier" which makes gradient upload and parameter download happen in strict order. On the other hand, asynchronous SGD won't wait for all trainers to finish uploading at a single step; this increases the parallelism of distributed training: parameter servers do not depend on each other, so they do parameter optimization concurrently. Parameter servers will not wait for trainers, so trainers will also do their work concurrently. However, asynchronous SGD introduces more randomness and noise into the gradients.
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
|
||||
introduction_en.md
|
||||
preparations_en.md
|
||||
cmd_argument_en.md
|
||||
multi_cluster/index_en.rst
|
||||
|
@ -1,13 +0,0 @@
|
||||
## Introduction
|
||||
|
||||
In this section, we'll explain how to run distributed training jobs with PaddlePaddle on different types of clusters. The diagram below shows the main architecture of a distributed training job:
|
||||
|
||||
<img src="https://user-images.githubusercontent.com/13348433/31772146-41523d84-b511-11e7-8a12-a69fd136c283.png" width="500">
|
||||
|
||||
- Data shard: training data will be split into multiple partitions, trainers use the partitions of the whole dataset to do the training job.
|
||||
- Trainer: each trainer reads the data shard, and trains the neural network. Then the trainer will upload calculated "gradients" to parameter servers, and wait for parameters to be optimized on the parameter server side. When that finishes, the trainer downloads optimized parameters and continues its training.
|
||||
- Parameter server: every parameter server stores part of the whole neural network model data. They will do optimization calculations when gradients are uploaded from trainers, and then send updated parameters to trainers.
|
||||
|
||||
PaddlePaddle can support both synchronous stochastic gradient descent (SGD) and asynchronous SGD.
|
||||
|
||||
When training with synchronous SGD, PaddlePaddle uses an internal "synchronization barrier" which makes gradient upload and parameter download happen in strict order. On the other hand, asynchronous SGD won't wait for all trainers to finish uploading at a single step; this increases the parallelism of distributed training: parameter servers do not depend on each other, so they do parameter optimization concurrently. Parameter servers will not wait for trainers, so trainers will also do their work concurrently. However, asynchronous SGD introduces more randomness and noise into the gradients.
|
After Width: | Height: | Size: 33 KiB |
After Width: | Height: | Size: 142 KiB |
@ -0,0 +1,72 @@
|
||||
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License. */
|
||||
#include <cuda.h>
|
||||
#include <cuda_runtime.h>
|
||||
#include "gtest/gtest.h"
|
||||
|
||||
#include "paddle/framework/init.h"
|
||||
#include "paddle/framework/mixed_vector.h"
|
||||
|
||||
using namespace paddle::framework;
|
||||
using namespace paddle::platform;
|
||||
using namespace paddle::memory;
|
||||
|
||||
template <typename T>
|
||||
__global__ void test(T* data, int size) {
|
||||
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < size;
|
||||
i += blockDim.x * gridDim.x) {
|
||||
data[i] *= 2;
|
||||
}
|
||||
}
|
||||
|
||||
/* Host-side sanity check for Vector: the pointer returned by data()
 * must mirror operator[], and after clear() + CopyFromCUDA() the
 * contents are expected to be restored from the device-side copy
 * (NOTE(review): relies on Vector keeping a CUDA mirror — confirm
 * against mixed_vector.h). */
TEST(Vector, Normal) {
  // Fill the device context pool before touching any device memory.
  InitDevices();

  Vector<size_t> vec({1, 2, 3});
  size_t* raw = vec.data();
  for (size_t idx = 0; idx != vec.size(); ++idx) {
    EXPECT_EQ(vec[idx], raw[idx]);
  }

  // Drop the host contents, then pull them back from the device.
  vec.clear();
  vec.CopyFromCUDA();

  const std::vector<size_t> expected = {1, 2, 3};
  for (size_t idx = 0; idx != expected.size(); ++idx) {
    EXPECT_EQ(expected[idx], vec[idx]);
  }
}
|
||||
|
||||
/* Copy-construction check: a Vector copied while the source holds a
 * CUDA buffer must expose identical contents on the CPU side, and
 * mutating the copy on the device (doubling every element) must leave
 * the source unchanged. */
TEST(Vector, MultipleCopy) {
  InitDevices();
  Vector<size_t> vec({1, 2, 3});
  CUDAPlace place(0);
  vec.mutable_data(place);
  auto vec2 = Vector<size_t>(vec);
  {
    // Fresh copy: CPU view matches the source element-for-element.
    const size_t* host = vec2.data(CPUPlace());
    for (size_t idx = 0; idx != vec2.size(); ++idx) {
      EXPECT_EQ(*(host + idx), vec[idx]);
    }
  }
  // Double every element of the copy on the device, then pull it back.
  test<size_t><<<3, 3>>>(vec2.mutable_data(place), vec2.size());
  vec2.CopyFromCUDA();
  {
    // Copy is doubled; comparison against the untouched source shows
    // the two vectors do not share mutable storage.
    const size_t* host = vec2.data(CPUPlace());
    for (size_t idx = 0; idx != vec2.size(); ++idx) {
      EXPECT_EQ(*(host + idx), vec[idx] * 2);
    }
  }
}
|
@ -1,31 +1,30 @@
|
||||
# Location of the Python-side "book" examples whose saved inference
# models these C++ tests load via --dirname.
set(PYTHON_TESTS_DIR ${PADDLE_SOURCE_DIR}/python/paddle/v2/fluid/tests)

# One cc_test per book model. Each links paddle_fluid wrapped in
# ARCHIVE_START/ARCHIVE_END and points --dirname at the model directory
# produced by the matching Python training test.
cc_test(test_inference_recognize_digits_mlp
        SRCS test_inference_recognize_digits.cc
        DEPS ARCHIVE_START paddle_fluid ARCHIVE_END
        ARGS --dirname=${PYTHON_TESTS_DIR}/book/recognize_digits_mlp.inference.model)
cc_test(test_inference_image_classification_vgg
        SRCS test_inference_image_classification.cc
        DEPS ARCHIVE_START paddle_fluid ARCHIVE_END
        ARGS --dirname=${PYTHON_TESTS_DIR}/book/image_classification_vgg.inference.model)
cc_test(test_inference_image_classification_resnet
        SRCS test_inference_image_classification.cc
        DEPS ARCHIVE_START paddle_fluid ARCHIVE_END
        ARGS --dirname=${PYTHON_TESTS_DIR}/book/image_classification_resnet.inference.model)
cc_test(test_inference_label_semantic_roles
        SRCS test_inference_label_semantic_roles.cc
        DEPS ARCHIVE_START paddle_fluid ARCHIVE_END
        ARGS --dirname=${PYTHON_TESTS_DIR}/book/label_semantic_roles.inference.model)
cc_test(test_inference_rnn_encoder_decoder
        SRCS test_inference_rnn_encoder_decoder.cc
        DEPS ARCHIVE_START paddle_fluid ARCHIVE_END
        ARGS --dirname=${PYTHON_TESTS_DIR}/book/rnn_encoder_decoder.inference.model)

# Order each inference test after the Python test that trains and saves
# its model, so the model directory exists when the test runs.
set_tests_properties(test_inference_recognize_digits_mlp
                     PROPERTIES DEPENDS test_recognize_digits)
set_tests_properties(test_inference_image_classification_vgg
                     PROPERTIES DEPENDS test_image_classification_train)
set_tests_properties(test_inference_image_classification_resnet
                     PROPERTIES DEPENDS test_image_classification_train)
set_tests_properties(test_inference_label_semantic_roles
                     PROPERTIES DEPENDS test_label_semantic_roles)
set_tests_properties(test_inference_rnn_encoder_decoder
                     PROPERTIES DEPENDS test_rnn_encoder_decoder)
|
||||
# inference_test(<target> [ARGS <variant>...])
#
# Registers C++ inference tests for a book chapter. With ARGS, one test
# is added per variant, named test_inference_<target>_<variant> and
# pointed at book/<target>_<variant>.inference.model; without ARGS a
# single test_inference_<target> is added for book/<target>.inference.model.
# Every test links paddle_fluid (ARCHIVE_START/ARCHIVE_END) and is made
# to DEPEND on test_<target>, the Python test that saves the model.
function(inference_test TARGET_NAME)
  set(options "")
  set(oneValueArgs "")
  set(multiValueArgs ARGS)
  cmake_parse_arguments(inference_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})

  set(PYTHON_TESTS_DIR ${PADDLE_SOURCE_DIR}/python/paddle/v2/fluid/tests)
  if(inference_test_ARGS)
    # One test per requested model variant.
    foreach(variant ${inference_test_ARGS})
      cc_test(test_inference_${TARGET_NAME}_${variant}
              SRCS test_inference_${TARGET_NAME}.cc
              DEPS ARCHIVE_START paddle_fluid ARCHIVE_END
              ARGS --dirname=${PYTHON_TESTS_DIR}/book/${TARGET_NAME}_${variant}.inference.model)
      set_tests_properties(test_inference_${TARGET_NAME}_${variant}
                           PROPERTIES DEPENDS test_${TARGET_NAME})
    endforeach()
  else()
    # Single unsuffixed model for this chapter.
    cc_test(test_inference_${TARGET_NAME}
            SRCS test_inference_${TARGET_NAME}.cc
            DEPS ARCHIVE_START paddle_fluid ARCHIVE_END
            ARGS --dirname=${PYTHON_TESTS_DIR}/book/${TARGET_NAME}.inference.model)
    set_tests_properties(test_inference_${TARGET_NAME}
                         PROPERTIES DEPENDS test_${TARGET_NAME})
  endif()
endfunction(inference_test)
|
||||
|
||||
# Register the book-chapter inference tests. ARGS lists model variants:
# image_classification, for example, has both a VGG and a ResNet model,
# so it expands to two tests.
inference_test(recognize_digits ARGS mlp)
inference_test(image_classification ARGS vgg resnet)
inference_test(label_semantic_roles)
inference_test(rnn_encoder_decoder)
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in new issue