Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into add_depthwiseConv_op_gpu
commit
3074ae7b8d
@ -0,0 +1,18 @@
|
||||
#FROM python:2.7.14
FROM nvidia/cuda:8.0-cudnn5-runtime-ubuntu16.04

# The base image is a bare CUDA runtime: install Python *and* pip before any
# `pip install` below runs, together with the other system packages, in a
# single apt layer.
RUN apt-get update && \
    apt-get install -y python python-pip iputils-ping libgtk2.0-dev
RUN pip install -U kubernetes opencv-python

# NOTE: By default CI built wheel packages turn WITH_DISTRIBUTE=OFF,
# so we must build one with distribute support to install in this image.
# The stock wheel is installed only long enough to run cifar.train10() once
# during the build, then removed again.
RUN pip install paddlepaddle
RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()" | python'
RUN pip uninstall -y paddlepaddle

# below lines may change a lot for debugging
ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin
ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root
# Install the locally built wheel(s) copied next to this Dockerfile.
ADD *.whl /
RUN pip install /*.whl && rm -f /*.whl && \
    chmod +x /usr/bin/paddle_k8s
ENV LD_LIBRARY_PATH=/usr/local/lib
ADD vgg16_fluid.py vgg16_v2.py /workspace/
|
@ -0,0 +1,76 @@
|
||||
# Performance for Distributed vgg16
|
||||
|
||||
## Test Result
|
||||
|
||||
### Hardware Information
|
||||
|
||||
- CPU: Intel(R) Xeon(R) CPU E5-2620 v4 @ 2.10GHz
|
||||
- cpu MHz : 2101.000
|
||||
- cache size : 20480 KB
|
||||
|
||||
### Single Node Single Thread
|
||||
|
||||
- PServer Count: 10
|
||||
- Trainer Count: 20
|
||||
- Metrics: samples / sec
|
||||
|
||||
| Batch Size | 32 | 64 | 128 | 256 |
|
||||
| -- | -- | -- | -- | -- |
|
||||
| PaddlePaddle Fluid | 15.44 | 16.32 | 16.74 | 16.79 |
|
||||
| PaddlePaddle v2 | 15.97 | 17.04 | 17.60 | 17.83 |
|
||||
| TensorFlow | - | - | - | - |
|
||||
|
||||
### Different Batch Size
|
||||
|
||||
- PServer Count: 10
|
||||
- Trainer Count: 20
|
||||
- Per trainer CPU Core: 1
|
||||
- Metrics: samples / sec
|
||||
|
||||
| Batch Size | 32 | 64 | 128 | 256 |
|
||||
| -- | -- | -- | -- | -- |
|
||||
| PaddlePaddle Fluid | 190.20 | 222.15 | 247.40 | 258.18 |
|
||||
| PaddlePaddle v2 | 170.96 | 233.71 | 256.14 | 329.23 |
|
||||
| TensorFlow | - | - | - | - |
|
||||
|
||||
|
||||
### Accelerate Rate
|
||||
|
||||
- Pserver Count: 20
|
||||
- Batch Size: 128
|
||||
- Metrics: samples / sec
|
||||
|
||||
| Trainer Count | 20 | 40 | 80 | 100 |
|
||||
| -- | -- | -- | -- | -- |
|
||||
| PaddlePaddle Fluid | 263.29 (78.64%) | 518.80 (77.47%) | 836.26 (62.44%) | 1019.29 (60.89%) |
|
||||
| PaddlePaddle v2 (need more tests) | 326.85 (92.85%) | 534.58 (75.93%) | 853.30 (60.60%) | 1041.99 (59.20%) |
|
||||
| TensorFlow | - | - | - | - |
|
||||
|
||||
### Different Pserver Count
|
||||
|
||||
- Trainer Count: 60
|
||||
- Batch Size: 128
|
||||
- Metrics: samples / sec
|
||||
|
||||
| PServer Count | 3 | 6 | 10 | 20 |
|
||||
| -- | -- | -- | -- | -- |
|
||||
| PaddlePaddle Fluid(should fix in next PR) | 589.1 | 592.6 | 656.4 | 655.8 |
|
||||
| PaddlePaddle v2 | 593.4 | 791.3 | 729.7 | 821.7 |
|
||||
| TensorFlow | - | - | - | - |
|
||||
|
||||
*The performance gap between Fluid and v2 comes from the network interference.*
|
||||
|
||||
|
||||
## Steps to Run the Performance Test
|
||||
|
||||
1. You must re-compile PaddlePaddle with `-DWITH_DISTRIBUTE=ON` to build PaddlePaddle with distributed support.
|
||||
1. When the build finishes, copy the output `whl` package located under `build/python/dist` to the current directory.
|
||||
1. Run `docker build -t [image:tag] .` to build the docker image and run `docker push [image:tag]` to push the image to a repository so Kubernetes can find it.
|
||||
1. Run `kubectl create -f pserver.yaml && kubectl create -f trainer.yaml` to start the job on your kubernetes cluster (you must configure the `kubectl` client before this step).
|
||||
1. Run `kubectl get po` to get running pods, and run `kubectl logs [podID]` to fetch the pod log of pservers and trainers.
|
||||
|
||||
Check the logs for the distributed training progress and analyze the performance.
|
||||
|
||||
## Enable Verbose Logs
|
||||
|
||||
Edit `pserver.yaml` and `trainer.yaml` and add the environment variables `GLOG_v=3` and `GLOG_logtostderr=1` to see what happened in detail.
|
@ -0,0 +1,72 @@
|
||||
apiVersion: extensions/v1beta1
|
||||
kind: ReplicaSet
|
||||
metadata:
|
||||
name: vgg16job-pserver
|
||||
spec:
|
||||
replicas: 10
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
paddle-job-pserver: vgg16job
|
||||
spec:
|
||||
hostNetwork: true
|
||||
imagePullSecrets:
|
||||
- name: job-registry-secret
|
||||
containers:
|
||||
- name: pserver
|
||||
image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
|
||||
imagePullPolicy: Always
|
||||
ports:
|
||||
- name: jobport-30236
|
||||
containerPort: 30236
|
||||
env:
|
||||
- name: PADDLE_JOB_NAME
|
||||
value: vgg16job
|
||||
- name: MKL_NUM_THREADS
|
||||
value: "1"
|
||||
- name: TRAINING_ROLE
|
||||
value: "PSERVER"
|
||||
- name: TRAINERS
|
||||
value: "20"
|
||||
- name: PSERVERS
|
||||
value: "10"
|
||||
- name: TOPOLOGY
|
||||
value: ""
|
||||
- name: ENTRY
|
||||
value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0"
|
||||
- name: TRAINER_PACKAGE
|
||||
value: "/workspace"
|
||||
- name: PADDLE_INIT_PORT
|
||||
value: "30236"
|
||||
- name: PADDLE_INIT_NICS
|
||||
value: "xgbe0"
|
||||
- name: PADDLE_INIT_TRAINER_COUNT
|
||||
value: "1"
|
||||
- name: PADDLE_INIT_PORTS_NUM
|
||||
value: "1"
|
||||
- name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
|
||||
value: "1"
|
||||
- name: PADDLE_INIT_NUM_GRADIENT_SERVERS
|
||||
value: "20"
|
||||
- name: PADDLE_INIT_NUM_PASSES
|
||||
value: "1"
|
||||
- name: PADDLE_INIT_USE_GPU
|
||||
value: "0"
|
||||
- name: LD_LIBRARY_PATH
|
||||
value: "/usr/local/lib:/usr/local/nvidia/lib64"
|
||||
- name: NAMESPACE
|
||||
valueFrom:
|
||||
fieldRef:
|
||||
fieldPath: "metadata.namespace"
|
||||
- name: POD_IP
|
||||
valueFrom:
|
||||
fieldRef:
|
||||
fieldPath: "status.podIP"
|
||||
command: ["paddle_k8s", "start_fluid"]
|
||||
resources:
|
||||
requests:
|
||||
memory: 10Gi
|
||||
cpu: 4
|
||||
limits:
|
||||
memory: 10Gi
|
||||
cpu: 4
|
@ -0,0 +1,69 @@
|
||||
apiVersion: batch/v1
|
||||
kind: Job
|
||||
metadata:
|
||||
name: vgg16job-trainer
|
||||
spec:
|
||||
parallelism: 20
|
||||
completions: 20
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
paddle-job: vgg16job
|
||||
spec:
|
||||
imagePullSecrets:
|
||||
- name: job-registry-secret
|
||||
hostNetwork: true
|
||||
containers:
|
||||
- name: trainer
|
||||
image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
|
||||
imagePullPolicy: Always
|
||||
command: ["paddle_k8s", "start_fluid"]
|
||||
env:
|
||||
- name: PADDLE_JOB_NAME
|
||||
value: vgg16job
|
||||
- name: TRAINING_ROLE
|
||||
value: "TRAINER"
|
||||
- name: TRAINERS
|
||||
value: "20"
|
||||
- name: PSERVERS
|
||||
value: "10"
|
||||
- name: TOPOLOGY
|
||||
value: ""
|
||||
- name: ENTRY
|
||||
value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0 --batch_size 128"
|
||||
- name: TRAINER_PACKAGE
|
||||
value: "/workspace"
|
||||
- name: PADDLE_INIT_PORT
|
||||
value: "30236"
|
||||
- name: PADDLE_INIT_NICS
|
||||
value: "xgbe0"
|
||||
- name: PADDLE_INIT_TRAINER_COUNT
|
||||
value: "1"
|
||||
- name: PADDLE_INIT_PORTS_NUM
|
||||
value: "1"
|
||||
- name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
|
||||
value: "1"
|
||||
- name: PADDLE_INIT_NUM_GRADIENT_SERVERS
|
||||
value: "20"
|
||||
- name: PADDLE_INIT_NUM_PASSES
|
||||
value: "1"
|
||||
- name: PADDLE_INIT_USE_GPU
|
||||
value: "0"
|
||||
- name: LD_LIBRARY_PATH
|
||||
value: "/usr/local/lib:/usr/local/nvidia/lib64"
|
||||
- name: NAMESPACE
|
||||
valueFrom:
|
||||
fieldRef:
|
||||
fieldPath: "metadata.namespace"
|
||||
- name: POD_IP
|
||||
valueFrom:
|
||||
fieldRef:
|
||||
fieldPath: "status.podIP"
|
||||
resources:
|
||||
requests:
|
||||
memory: 40Gi
|
||||
cpu: 2
|
||||
limits:
|
||||
memory: 40Gi
|
||||
cpu: 2
|
||||
restartPolicy: Never
|
@ -0,0 +1,64 @@
|
||||
apiVersion: extensions/v1beta1
|
||||
kind: ReplicaSet
|
||||
metadata:
|
||||
name: vgg16v2job-pserver
|
||||
spec:
|
||||
replicas: 10
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
paddle-job-pserver: vgg16v2job
|
||||
spec:
|
||||
hostNetwork: true
|
||||
imagePullSecrets:
|
||||
- name: job-registry-secret
|
||||
containers:
|
||||
- name: pserver
|
||||
image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
|
||||
imagePullPolicy: Always
|
||||
ports:
|
||||
- name: jobport-30236
|
||||
containerPort: 30236
|
||||
env:
|
||||
- name: PADDLE_JOB_NAME
|
||||
value: vgg16v2job
|
||||
- name: TRAINERS
|
||||
value: "20"
|
||||
- name: PSERVERS
|
||||
value: "10"
|
||||
- name: TOPOLOGY
|
||||
value: ""
|
||||
- name: ENTRY
|
||||
value: "python train.py"
|
||||
- name: TRAINER_PACKAGE
|
||||
value: "/workspace"
|
||||
- name: PADDLE_INIT_PORT
|
||||
value: "30236"
|
||||
- name: PADDLE_INIT_NICS
|
||||
value: "xgbe0"
|
||||
- name: PADDLE_INIT_TRAINER_COUNT
|
||||
value: "1"
|
||||
- name: PADDLE_INIT_PORTS_NUM
|
||||
value: "1"
|
||||
- name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
|
||||
value: "1"
|
||||
- name: PADDLE_INIT_NUM_GRADIENT_SERVERS
|
||||
value: "20"
|
||||
- name: PADDLE_INIT_NUM_PASSES
|
||||
value: "1"
|
||||
- name: PADDLE_INIT_USE_GPU
|
||||
value: "0"
|
||||
- name: LD_LIBRARY_PATH
|
||||
value: "/usr/local/lib:/usr/local/nvidia/lib64"
|
||||
- name: NAMESPACE
|
||||
valueFrom:
|
||||
fieldRef:
|
||||
fieldPath: "metadata.namespace"
|
||||
command: ["paddle_k8s", "start_pserver"]
|
||||
resources:
|
||||
requests:
|
||||
memory: 10Gi
|
||||
cpu: 4
|
||||
limits:
|
||||
memory: 10Gi
|
||||
cpu: 4
|
@ -0,0 +1,65 @@
|
||||
apiVersion: batch/v1
|
||||
kind: Job
|
||||
metadata:
|
||||
name: vgg16v2job-trainer
|
||||
spec:
|
||||
parallelism: 20
|
||||
completions: 20
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
paddle-job: vgg16v2job
|
||||
spec:
|
||||
imagePullSecrets:
|
||||
- name: job-registry-secret
|
||||
hostNetwork: true
|
||||
containers:
|
||||
- name: trainer
|
||||
image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
|
||||
imagePullPolicy: Always
|
||||
command: ["paddle_k8s", "start_trainer", "v2"]
|
||||
env:
|
||||
- name: PADDLE_JOB_NAME
|
||||
value: vgg16v2job
|
||||
- name: BATCH_SIZE
|
||||
value: "256"
|
||||
- name: TRAINERS
|
||||
value: "20"
|
||||
- name: PSERVERS
|
||||
value: "10"
|
||||
- name: TOPOLOGY
|
||||
value: ""
|
||||
- name: ENTRY
|
||||
value: "cd /workspace && MKL_NUM_THREADS=1 python /workspace/vgg16_v2.py"
|
||||
- name: TRAINER_PACKAGE
|
||||
value: "/workspace"
|
||||
- name: PADDLE_INIT_PORT
|
||||
value: "30236"
|
||||
- name: PADDLE_INIT_NICS
|
||||
value: "xgbe0"
|
||||
- name: PADDLE_INIT_TRAINER_COUNT
|
||||
value: "1"
|
||||
- name: PADDLE_INIT_PORTS_NUM
|
||||
value: "1"
|
||||
- name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
|
||||
value: "1"
|
||||
- name: PADDLE_INIT_NUM_GRADIENT_SERVERS
|
||||
value: "20"
|
||||
- name: PADDLE_INIT_NUM_PASSES
|
||||
value: "2"
|
||||
- name: PADDLE_INIT_USE_GPU
|
||||
value: "0"
|
||||
- name: LD_LIBRARY_PATH
|
||||
value: "/usr/local/lib:/usr/local/nvidia/lib64"
|
||||
- name: NAMESPACE
|
||||
valueFrom:
|
||||
fieldRef:
|
||||
fieldPath: "metadata.namespace"
|
||||
resources:
|
||||
requests:
|
||||
memory: 40Gi
|
||||
cpu: 2
|
||||
limits:
|
||||
memory: 40Gi
|
||||
cpu: 2
|
||||
restartPolicy: Never
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,154 @@
|
||||
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
#Licensed under the Apache License, Version 2.0 (the "License");
|
||||
#you may not use this file except in compliance with the License.
|
||||
#You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
#Unless required by applicable law or agreed to in writing, software
|
||||
#distributed under the License is distributed on an "AS IS" BASIS,
|
||||
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
#See the License for the specific language governing permissions and
|
||||
#limitations under the License.
|
||||
|
||||
import gzip
|
||||
|
||||
import paddle.v2.dataset.cifar as cifar
|
||||
import paddle.v2 as paddle
|
||||
import time
|
||||
import os
|
||||
|
||||
DATA_DIM = 3 * 32 * 32
|
||||
CLASS_DIM = 10
|
||||
BATCH_SIZE = os.getenv("BATCH_SIZE")
|
||||
if BATCH_SIZE:
|
||||
BATCH_SIZE = int(BATCH_SIZE)
|
||||
else:
|
||||
BATCH_SIZE = 128
|
||||
print "batch_size", BATCH_SIZE
|
||||
NODE_COUNT = int(os.getenv("TRAINERS"))
|
||||
ts = 0
|
||||
|
||||
|
||||
def vgg(input, nums, class_dim):
    """Assemble a VGG-style image classifier.

    Five convolution groups (64, 128, 256, 512 and 512 filters), each ending
    in a size-2, stride-2 max-pool, feed two 512-unit ReLU layers with a
    drop rate of 0.5 and a final softmax layer.

    Args:
        input: layer providing the image; the first group is told that it
            has 3 channels.
        nums: sequence of exactly five ints, the number of convolutions in
            each group.
        class_dim: width of the softmax output layer.

    Returns:
        The softmax output layer.
    """
    assert len(nums) == 5

    def conv_block(input, num_filter, groups, num_channels=None):
        # `groups` stacked 3x3 ReLU convolutions followed by one max-pool.
        return paddle.networks.img_conv_group(
            input=input,
            num_channels=num_channels,
            conv_num_filter=[num_filter] * groups,
            conv_filter_size=3,
            conv_act=paddle.activation.Relu(),
            pool_size=2,
            pool_stride=2,
            pool_type=paddle.pooling.Max())

    # the channel of input feature is 3
    features = conv_block(input, 64, nums[0], 3)
    for width, depth in zip((128, 256, 512, 512), nums[1:]):
        features = conv_block(features, width, depth)

    fc_dim = 512
    hidden = features
    for _ in range(2):
        hidden = paddle.layer.fc(
            input=hidden,
            size=fc_dim,
            act=paddle.activation.Relu(),
            layer_attr=paddle.attr.Extra(drop_rate=0.5))

    return paddle.layer.fc(
        input=hidden, size=class_dim, act=paddle.activation.Softmax())
|
||||
|
||||
|
||||
def vgg13(input, class_dim):
    """VGG variant with two convolutions in each of the five groups."""
    return vgg(input, [2, 2, 2, 2, 2], class_dim)
|
||||
|
||||
|
||||
def vgg16(input, class_dim):
    """VGG variant with 2, 2, 3, 3 and 3 convolutions in the five groups."""
    return vgg(input, [2, 2, 3, 3, 3], class_dim)
|
||||
|
||||
|
||||
def vgg19(input, class_dim):
    """VGG variant with 2, 2, 4, 4 and 4 convolutions in the five groups."""
    return vgg(input, [2, 2, 4, 4, 4], class_dim)
|
||||
|
||||
|
||||
def main():
|
||||
global ts
|
||||
paddle.init(use_gpu=False)
|
||||
image = paddle.layer.data(
|
||||
name="image", type=paddle.data_type.dense_vector(DATA_DIM))
|
||||
lbl = paddle.layer.data(
|
||||
name="label", type=paddle.data_type.integer_value(CLASS_DIM))
|
||||
|
||||
extra_layers = None
|
||||
# NOTE: for v2 distributed training need averaging updates.
|
||||
learning_rate = 1e-3 / NODE_COUNT
|
||||
out = vgg16(image, class_dim=CLASS_DIM)
|
||||
cost = paddle.layer.classification_cost(input=out, label=lbl)
|
||||
|
||||
# Create parameters
|
||||
parameters = paddle.parameters.create(cost)
|
||||
|
||||
# Create optimizer
|
||||
optimizer = paddle.optimizer.Momentum(
|
||||
momentum=0.9,
|
||||
regularization=paddle.optimizer.L2Regularization(rate=0.0005 *
|
||||
BATCH_SIZE),
|
||||
learning_rate=learning_rate / BATCH_SIZE,
|
||||
learning_rate_decay_a=0.1,
|
||||
learning_rate_decay_b=128000 * 35,
|
||||
learning_rate_schedule="discexp", )
|
||||
|
||||
train_reader = paddle.batch(
|
||||
paddle.reader.shuffle(
|
||||
cifar.train10(),
|
||||
# To use other data, replace the above line with:
|
||||
# reader.train_reader('train.list'),
|
||||
buf_size=1000),
|
||||
batch_size=BATCH_SIZE)
|
||||
test_reader = paddle.batch(
|
||||
cifar.test10(),
|
||||
# To use other data, replace the above line with:
|
||||
# reader.test_reader('val.list'),
|
||||
batch_size=BATCH_SIZE)
|
||||
|
||||
# Create trainer
|
||||
trainer = paddle.trainer.SGD(cost=cost,
|
||||
parameters=parameters,
|
||||
update_equation=optimizer,
|
||||
extra_layers=extra_layers,
|
||||
is_local=False)
|
||||
|
||||
# End batch and end pass event handler
|
||||
def event_handler(event):
|
||||
global ts, ts_pass
|
||||
if isinstance(event, paddle.event.BeginPass):
|
||||
ts_pass = time.time()
|
||||
if isinstance(event, paddle.event.BeginIteration):
|
||||
ts = time.time()
|
||||
if isinstance(event, paddle.event.EndIteration):
|
||||
if event.batch_id % 1 == 0:
|
||||
print "\nPass %d, Batch %d, Cost %f, %s, spent: %f" % (
|
||||
event.pass_id, event.batch_id, event.cost, event.metrics,
|
||||
time.time() - ts)
|
||||
if isinstance(event, paddle.event.EndPass):
|
||||
print "Pass %d end, spent: %f" % (event.pass_id,
|
||||
time.time() - ts_pass)
|
||||
result = trainer.test(reader=test_reader)
|
||||
print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
|
||||
|
||||
trainer.train(
|
||||
reader=train_reader, num_passes=200, event_handler=event_handler)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
@ -0,0 +1,32 @@
|
||||
### Design Doc: Switch
|
||||
|
||||
### Background
|
||||
|
||||
Many programming languages provide `switch` as a generalization of `if-elif-else`. We want to add it to Fluid.
|
||||
|
||||
The following example shows the usage of `fluid.switch`.
|
||||
|
||||
```python
|
||||
a = fluid.Var(10)
|
||||
b = fluid.Var(0)
|
||||
|
||||
switch = fluid.switch()
|
||||
with switch.block():
|
||||
with switch.case(fluid.less_equal(a, 10)):
|
||||
fluid.print("Case 1")
|
||||
with switch.case(fluid.larger(a, 0)):
|
||||
fluid.print("Case 2")
|
||||
with switch.default():
|
||||
fluid.print("Case 3")
|
||||
```
|
||||
|
||||
### The Semantics
|
||||
|
||||
1. A `switch` control-flow checks cases one-by-one.
|
||||
1. The condition of each case is a boolean value, which is a scalar; this differs from the `fluid.if_else` control-flow, whose condition could be a vector of boolean values.
|
||||
1. It runs the first matched case, or the default case (if there is one) when no case matches.
|
||||
1. Once it matches a case, it runs the corresponding branch and only that branch. It's like there is a C's `break` keyword at the end of each case.
|
||||
|
||||
The above program should print and print only "Case 1".
|
||||
|
||||
The implementation of the backward pass of the `switch` control-flow is easier than the backward of the `if_else`, because `switch` runs at most one branch, whereas `if_else` could run more than one branch.
|
@ -0,0 +1,121 @@
|
||||
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License. */
|
||||
|
||||
#include "paddle/operators/box_coder_op.h"
|
||||
|
||||
namespace paddle {
|
||||
namespace operators {
|
||||
|
||||
class BoxCoderOp : public framework::OperatorWithKernel {
|
||||
public:
|
||||
using framework::OperatorWithKernel::OperatorWithKernel;
|
||||
|
||||
protected:
|
||||
void InferShape(framework::InferShapeContext *ctx) const override {
|
||||
PADDLE_ENFORCE(ctx->HasInput("PriorBox"),
|
||||
"Input(PriorBox) of BoxCoderOp should not be null.");
|
||||
PADDLE_ENFORCE(ctx->HasInput("PriorBoxVar"),
|
||||
"Input(PriorBoxVar) of BoxCoderOp should not be null.");
|
||||
PADDLE_ENFORCE(ctx->HasInput("TargetBox"),
|
||||
"Input(TargetBox) of BoxCoderOp should not be null.");
|
||||
PADDLE_ENFORCE(ctx->HasOutput("OutputBox"),
|
||||
"Output(OutputBox) of BoxCoderOp should not be null.");
|
||||
|
||||
auto prior_box_dims = ctx->GetInputDim("PriorBox");
|
||||
auto prior_box_var_dims = ctx->GetInputDim("PriorBoxVar");
|
||||
auto target_box_dims = ctx->GetInputDim("TargetBox");
|
||||
|
||||
PADDLE_ENFORCE_EQ(prior_box_dims.size(), 2,
|
||||
"The rank of Input of PriorBoxVar must be 2");
|
||||
PADDLE_ENFORCE_EQ(prior_box_dims[1], 4, "The shape of PriorBox is [N, 4]");
|
||||
PADDLE_ENFORCE_EQ(prior_box_dims, prior_box_var_dims);
|
||||
PADDLE_ENFORCE_EQ(target_box_dims.size(), 2,
|
||||
"The rank of Input of TargetBox must be 2");
|
||||
PADDLE_ENFORCE_EQ(target_box_dims[1], 4,
|
||||
"The shape of TargetBox is [M, 4]");
|
||||
|
||||
GetBoxCodeType(ctx->Attrs().Get<std::string>("code_type"));
|
||||
|
||||
ctx->SetOutputDim(
|
||||
"OutputBox",
|
||||
framework::make_ddim({target_box_dims[0], prior_box_dims[0], 4}));
|
||||
ctx->ShareLoD("TargetBox", /*->*/ "OutputBox");
|
||||
}
|
||||
};
|
||||
|
||||
// Declares the inputs, attribute, output and user-facing documentation of
// the box_coder operator.
class BoxCoderOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  BoxCoderOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput(
        "PriorBox",
        "(Tensor, default Tensor<float>) "
        "Box list PriorBox is a 2-D Tensor with shape [M, 4] holds M boxes, "
        "each box is represented as [xmin, ymin, xmax, ymax], "
        "[xmin, ymin] is the left top coordinate of the anchor box, "
        "if the input is image feature map, they are close to the origin "
        "of the coordinate system. [xmax, ymax] is the right bottom "
        "coordinate of the anchor box.");
    AddInput("PriorBoxVar",
             "(Tensor, default Tensor<float>) "
             "PriorBoxVar is a 2-D Tensor with shape [M, 4] holds M group "
             "of variance.");
    AddInput(
        "TargetBox",
        "(LoDTensor or Tensor) this input is a 2-D LoDTensor with shape "
        "[N, 4], each box is represented as [xmin, ymin, xmax, ymax], "
        "[xmin, ymin] is the left top coordinate of the box if the input "
        "is image feature map, they are close to the origin of the coordinate "
        "system. [xmax, ymax] is the right bottom coordinate of the box. "
        "This tensor can contain LoD information to represent a batch "
        "of inputs. One instance of this batch can contain different "
        "numbers of entities.");
    // Only the two enumerated code types are accepted.
    AddAttr<std::string>("code_type",
                         "(string, default encode_center_size) "
                         "the code type used with the target box")
        .SetDefault("encode_center_size")
        .InEnum({"encode_center_size", "decode_center_size"});
    // The type tag appears once (it was previously duplicated as
    // "(LoDTensor or Tensor) (Tensor)").
    AddOutput(
        "OutputBox",
        "(LoDTensor or Tensor) "
        "The output of box_coder_op, a tensor with shape [N, M, 4] "
        "representing the result of N target boxes encoded/decoded with "
        "M Prior boxes and variances.");

    // The decode formulas previously contained a stray `*` ("tx * + px").
    AddComment(R"DOC(
Bounding Box Coder Operator.
Encode/Decode the target bounding box with the priorbox information.
The Encoding schema described below:
ox = (tx - px) / pw / pxv
oy = (ty - py) / ph / pyv
ow = log(abs(tw / pw)) / pwv
oh = log(abs(th / ph)) / phv
The Decoding schema described below:
ox = (pw * pxv * tx + px) - tw / 2
oy = (ph * pyv * ty + py) - th / 2
ow = exp(pwv * tw) * pw + tw / 2
oh = exp(phv * th) * ph + th / 2
where tx, ty, tw, th denote the target box's center coordinates, width and
height respectively. Similarly, px, py, pw, ph denote the priorbox's(anchor)
center coordinates, width and height. pxv, pyv, pwv, phv denote the variance
of the priorbox and ox, oy, ow, oh denote the encoded/decoded coordinates,
width and height.
)DOC");
  }
};
|
||||
|
||||
} // namespace operators
|
||||
} // namespace paddle
|
||||
|
||||
namespace ops = paddle::operators;
|
||||
REGISTER_OP_WITHOUT_GRADIENT(box_coder, ops::BoxCoderOp, ops::BoxCoderOpMaker);
|
||||
REGISTER_OP_CPU_KERNEL(box_coder, ops::BoxCoderKernel<float>,
|
||||
ops::BoxCoderKernel<double>);
|
@ -0,0 +1,150 @@
|
||||
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License. */
|
||||
|
||||
#include "paddle/operators/box_coder_op.h"
|
||||
#include "paddle/platform/cuda_helper.h"
|
||||
|
||||
namespace paddle {
|
||||
namespace operators {
|
||||
|
||||
template <typename T>
|
||||
__global__ void EncodeCenterSizeKernel(const T* prior_box_data,
|
||||
const T* prior_box_var_data,
|
||||
const T* target_box_data, const int row,
|
||||
const int col, const int len,
|
||||
T* output) {
|
||||
const int idx = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
if (idx < row * col) {
|
||||
const int row_idx = idx / col;
|
||||
const int col_idx = idx % col;
|
||||
T prior_box_width =
|
||||
prior_box_data[col_idx * len + 2] - prior_box_data[col_idx * len];
|
||||
T prior_box_height =
|
||||
prior_box_data[col_idx * len + 3] - prior_box_data[col_idx * len + 1];
|
||||
T prior_box_center_x =
|
||||
(prior_box_data[col_idx * len + 2] + prior_box_data[col_idx * len]) / 2;
|
||||
T prior_box_center_y = (prior_box_data[col_idx * len + 3] +
|
||||
prior_box_data[col_idx * len + 1]) /
|
||||
2;
|
||||
|
||||
T target_box_center_x =
|
||||
(target_box_data[row_idx * len + 2] + target_box_data[row_idx * len]) /
|
||||
2;
|
||||
T target_box_center_y = (target_box_data[row_idx * len + 3] +
|
||||
target_box_data[row_idx * len + 1]) /
|
||||
2;
|
||||
T target_box_width =
|
||||
target_box_data[row_idx * len + 2] - target_box_data[row_idx * len];
|
||||
T target_box_height =
|
||||
target_box_data[row_idx * len + 3] - target_box_data[row_idx * len + 1];
|
||||
|
||||
output[idx * len] = (target_box_center_x - prior_box_center_x) /
|
||||
prior_box_width / prior_box_var_data[col_idx * len];
|
||||
output[idx * len + 1] = (target_box_center_y - prior_box_center_y) /
|
||||
prior_box_height /
|
||||
prior_box_var_data[col_idx * len + 1];
|
||||
output[idx * len + 2] = log(fabs(target_box_width / prior_box_width)) /
|
||||
prior_box_var_data[col_idx * len + 2];
|
||||
output[idx * len + 3] = log(fabs(target_box_height / prior_box_height)) /
|
||||
prior_box_var_data[col_idx * len + 3];
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__global__ void DecodeCenterSizeKernel(const T* prior_box_data,
|
||||
const T* prior_box_var_data,
|
||||
const T* target_box_data, const int row,
|
||||
const int col, const int len,
|
||||
T* output) {
|
||||
const int idx = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
if (idx < row * col) {
|
||||
const int row_idx = idx / col;
|
||||
const int col_idx = idx % col;
|
||||
T prior_box_width =
|
||||
prior_box_data[col_idx * len + 2] - prior_box_data[col_idx * len];
|
||||
T prior_box_height =
|
||||
prior_box_data[col_idx * len + 3] - prior_box_data[col_idx * len + 1];
|
||||
T prior_box_center_x =
|
||||
(prior_box_data[col_idx * len + 2] + prior_box_data[col_idx * len]) / 2;
|
||||
T prior_box_center_y = (prior_box_data[col_idx * len + 3] +
|
||||
prior_box_data[col_idx * len + 1]) /
|
||||
2;
|
||||
|
||||
T target_box_width = exp(prior_box_var_data[col_idx * len + 2] *
|
||||
target_box_data[row_idx * len + 2]) *
|
||||
prior_box_width;
|
||||
T target_box_height = exp(prior_box_var_data[col_idx * len + 3] *
|
||||
target_box_data[row_idx * len + 3]) *
|
||||
prior_box_height;
|
||||
T target_box_center_x = prior_box_var_data[col_idx * len] *
|
||||
target_box_data[row_idx * len] *
|
||||
prior_box_width +
|
||||
prior_box_center_x;
|
||||
T target_box_center_y = prior_box_var_data[col_idx * len + 1] *
|
||||
target_box_data[row_idx * len + 1] *
|
||||
prior_box_height +
|
||||
prior_box_center_y;
|
||||
|
||||
output[idx * len] = target_box_center_x - target_box_width / 2;
|
||||
output[idx * len + 1] = target_box_center_y - target_box_height / 2;
|
||||
output[idx * len + 2] = target_box_center_x + target_box_width / 2;
|
||||
output[idx * len + 3] = target_box_center_y + target_box_height / 2;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
class BoxCoderCUDAKernel : public framework::OpKernel<T> {
|
||||
public:
|
||||
void Compute(const framework::ExecutionContext& context) const override {
|
||||
PADDLE_ENFORCE(platform::is_gpu_place(context.GetPlace()),
|
||||
"This kernel only runs on GPU device.");
|
||||
auto* prior_box = context.Input<framework::Tensor>("PriorBox");
|
||||
auto* prior_box_var = context.Input<framework::Tensor>("PriorBoxVar");
|
||||
auto* target_box = context.Input<framework::LoDTensor>("TargetBox");
|
||||
auto* output_box = context.Output<framework::Tensor>("OutputBox");
|
||||
|
||||
if (target_box->lod().size()) {
|
||||
PADDLE_ENFORCE_EQ(target_box->lod().size(), 1,
|
||||
"Only support 1 level of LoD.");
|
||||
}
|
||||
auto row = target_box->dims()[0];
|
||||
auto col = prior_box->dims()[0];
|
||||
auto len = prior_box->dims()[1];
|
||||
int block = 512;
|
||||
int grid = (row * col + block - 1) / block;
|
||||
auto& device_ctx = context.cuda_device_context();
|
||||
|
||||
const T* prior_box_data = prior_box->data<T>();
|
||||
const T* prior_box_var_data = prior_box_var->data<T>();
|
||||
const T* target_box_data = target_box->data<T>();
|
||||
|
||||
output_box->mutable_data<T>({row, col, len}, context.GetPlace());
|
||||
T* output = output_box->data<T>();
|
||||
|
||||
auto code_type = GetBoxCodeType(context.Attr<std::string>("code_type"));
|
||||
if (code_type == BoxCodeType::kEncodeCenterSize) {
|
||||
EncodeCenterSizeKernel<T><<<grid, block, 0, device_ctx.stream()>>>(
|
||||
prior_box_data, prior_box_var_data, target_box_data, row, col, len,
|
||||
output);
|
||||
} else if (code_type == BoxCodeType::kDecodeCenterSize) {
|
||||
DecodeCenterSizeKernel<T><<<grid, block, 0, device_ctx.stream()>>>(
|
||||
prior_box_data, prior_box_var_data, target_box_data, row, col, len,
|
||||
output);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace operators
|
||||
} // namespace paddle
|
||||
|
||||
namespace ops = paddle::operators;
|
||||
REGISTER_OP_CUDA_KERNEL(box_coder, ops::BoxCoderCUDAKernel<float>,
|
||||
ops::BoxCoderCUDAKernel<double>);
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in new issue