diff --git a/benchmark/paddle/image/provider.py b/benchmark/paddle/image/provider.py index 1ac47212b5..4703944c87 100644 --- a/benchmark/paddle/image/provider.py +++ b/benchmark/paddle/image/provider.py @@ -22,5 +22,5 @@ def initHook(settings, height, width, color, num_class, **kwargs): def process(settings, file_list): for i in xrange(1024): img = np.random.rand(1, settings.data_size).reshape(-1, 1).flatten() - lab = random.randint(0, settings.num_class) + lab = random.randint(0, settings.num_class - 1) yield img.astype('float32'), int(lab) diff --git a/benchmark/paddle/image/run_mkldnn.sh b/benchmark/paddle/image/run_mkldnn.sh new file mode 100755 index 0000000000..5b0a037344 --- /dev/null +++ b/benchmark/paddle/image/run_mkldnn.sh @@ -0,0 +1,51 @@ +set -e + +unset OMP_NUM_THREADS MKL_NUM_THREADS +export OMP_DYNAMIC="FALSE" +export KMP_AFFINITY="granularity=fine,compact,0,0" + +function train() { + topology=$1 + bs=$2 + use_mkldnn=$3 + if [ $3 == "True" ]; then + use_mkldnn=$3 + thread=1 + log="logs/${topology}-mkldnn-${bs}.log" + elif [ $3 == "False" ]; then + use_mkldnn=$3 + thread=`nproc` + log="logs/${topology}-${thread}mklml-${bs}.log" + else + echo "Wrong input $3, use True or False." + fi + args="batch_size=${bs}" + config="${topology}.py" + paddle train --job=time \ + --config=$config \ + --use_mkldnn=$use_mkldnn \ + --use_gpu=False \ + --trainer_count=$thread \ + --log_period=10 \ + --test_period=100 \ + --config_args=$args \ + 2>&1 | tee ${log} +} + +if [ ! -d "train.list" ]; then + echo " " > train.list +fi +if [ ! -d "logs" ]; then + mkdir logs +fi + +#========= mkldnn =========# +# vgg +train vgg 64 True +train vgg 128 True +train vgg 256 True + +#========== mklml ===========# +train vgg 64 False +train vgg 128 False +train vgg 256 False diff --git a/benchmark/paddle/image/vgg.py b/benchmark/paddle/image/vgg.py new file mode 100644 index 0000000000..b8429975f5 --- /dev/null +++ b/benchmark/paddle/image/vgg.py @@ -0,0 +1,103 @@ +#!/usr/bin/env python +from paddle.trainer_config_helpers import * + +height = 224 +width = 224 +num_class = 1000 +batch_size = get_config_arg('batch_size', int, 64) +layer_num = get_config_arg('layer_num', int, 19) + +args = {'height': height, 'width': width, 'color': True, 'num_class': num_class} +define_py_data_sources2( + "train.list", None, module="provider", obj="process", args=args) + +settings( + batch_size=batch_size, + learning_rate=0.01 / batch_size, + learning_method=MomentumOptimizer(0.9), + regularization=L2Regularization(0.0005 * batch_size)) + +img = data_layer(name='image', size=height * width * 3) + + +def vgg_network(vgg_num=3): + tmp = img_conv_group( + input=img, + num_channels=3, + conv_padding=1, + conv_num_filter=[64, 64], + conv_filter_size=3, + conv_act=ReluActivation(), + pool_size=2, + pool_stride=2, + pool_type=MaxPooling()) + + tmp = img_conv_group( + input=tmp, + conv_num_filter=[128, 128], + conv_padding=1, + conv_filter_size=3, + conv_act=ReluActivation(), + pool_stride=2, + pool_type=MaxPooling(), + pool_size=2) + + channels = [] + for i in range(vgg_num): + channels.append(256) + tmp = img_conv_group( + input=tmp, + conv_num_filter=channels, + conv_padding=1, + conv_filter_size=3, + conv_act=ReluActivation(), + pool_stride=2, + pool_type=MaxPooling(), + pool_size=2) + channels = [] + for i in range(vgg_num): + channels.append(512) + tmp = img_conv_group( + input=tmp, + conv_num_filter=channels, + conv_padding=1, + conv_filter_size=3, + conv_act=ReluActivation(), + pool_stride=2, + pool_type=MaxPooling(), + pool_size=2) + tmp = img_conv_group( + input=tmp, + conv_num_filter=channels, + conv_padding=1, + conv_filter_size=3, + conv_act=ReluActivation(), + pool_stride=2, + pool_type=MaxPooling(), + pool_size=2) + + tmp = fc_layer( + input=tmp, + size=4096, + act=ReluActivation(), + layer_attr=ExtraAttr(drop_rate=0.5)) + + tmp = fc_layer( + input=tmp, + size=4096, + act=ReluActivation(), + layer_attr=ExtraAttr(drop_rate=0.5)) + + return fc_layer(input=tmp, size=num_class, act=SoftmaxActivation()) + + +if layer_num == 16: + vgg = vgg_network(3) +elif layer_num == 19: + vgg = vgg_network(4) +else: + print("Wrong layer number.") + +lab = data_layer('label', num_class) +loss = cross_entropy(input=vgg, label=lab) +outputs(loss) diff --git a/cmake/util.cmake b/cmake/util.cmake index ac911052eb..d1aee3e170 100644 --- a/cmake/util.cmake +++ b/cmake/util.cmake @@ -97,6 +97,10 @@ function(link_paddle_exe TARGET_NAME) target_link_libraries(${TARGET_NAME} log) endif(ANDROID) + if(WITH_MKLDNN AND WITH_MKLML AND MKLDNN_IOMP_DIR) + target_link_libraries(${TARGET_NAME} "-L${MKLDNN_IOMP_DIR} -liomp5 -Wl,--as-needed") + endif() + add_dependencies(${TARGET_NAME} ${external_project_dependencies}) endfunction() diff --git a/paddle/gserver/activations/MKLDNNActivation.h b/paddle/gserver/activations/MKLDNNActivation.h index 86ffe38736..40dd8c618a 100644 --- a/paddle/gserver/activations/MKLDNNActivation.h +++ b/paddle/gserver/activations/MKLDNNActivation.h @@ -100,6 +100,7 @@ public: if (cnt_ == act.value->getElementCnt()) { return; } + VLOG(MKLDNN_BASE) << getName() << " reset mkldnn forward"; cnt_ = act.value->getElementCnt(); stream_.reset(new MKLDNNStream()); auto eng = CPUEngine::Instance().getEngine(); @@ -110,7 +111,6 @@ public: float alpha = getAlpha(); float beta = getBeta(); - /// forward pipelineFwd_.clear(); val_ = std::dynamic_pointer_cast(act.value); if (val_ == nullptr) { @@ -152,6 +152,7 @@ public: if (!needResetBwd_) { return; } + VLOG(MKLDNN_BASE) << getName() << " reset mkldnn backward"; needResetBwd_ = false; mkldnn::algorithm algo = getAlgo(this->getName()); float alpha = getBwdAlpha(); diff --git a/paddle/gserver/layers/MKLDNNConvLayer.cpp b/paddle/gserver/layers/MKLDNNConvLayer.cpp index 88b047c89b..9a0abd291a 100644 --- a/paddle/gserver/layers/MKLDNNConvLayer.cpp +++ b/paddle/gserver/layers/MKLDNNConvLayer.cpp @@ -64,7 +64,7 @@ bool MKLDNNConvLayer::init(const LayerMap& layerMap, // create biases if (biasParameter_.get() != NULL) { - biases_ = std::unique_ptr(new Weight(1, oc_, biasParameter_)); + biases_ = std::unique_ptr(new Weight(1, oc_, biasParameter_, 0)); } return true; } @@ -251,22 +251,31 @@ void MKLDNNConvLayer::resetInValue( // create buffer and reorder if input value do not match cpuInVal_ = nullptr; cvtInVal_ = nullptr; - if (inputIsOnlyMKLDNN()) { - MKLDNNMatrixPtr dnnIn = std::dynamic_pointer_cast(inMat); - CHECK(dnnIn) << "Input should be MKLDNNMatrix"; - if (dnnIn->getPrimitiveDesc() != in->getPrimitiveDesc()) { - CHECK_EQ(dnnIn->getFormat(), format::nc); + + MKLDNNMatrixPtr dnnIn = std::dynamic_pointer_cast(inMat); + CHECK_EQ(inputIsOnlyMKLDNN(), dnnIn != nullptr); + if (dnnIn != nullptr && dnnIn->getPrimitiveDesc() == in->getPrimitiveDesc()) { + in = dnnIn; + return; + } + if (dnnIn) { + if (dnnIn->getFormat() == format::nc) { CHECK(ih_ == 1 && iw_ == 1) << "when input is nc format"; // create a new one with nchw format and same data memory::dims inDims = memory::dims{bs_, ic_, 1, 1}; dnnIn = MKLDNNMatrix::create(inMat, inDims, format::nchw, engine_); - CHECK(dnnIn->getPrimitiveDesc() == in->getPrimitiveDesc()); } - in = dnnIn; + if (dnnIn->getPrimitiveDesc() == in->getPrimitiveDesc()) { + in = dnnIn; + return; + } + cpuInVal_ = dnnIn; + in = MKLDNNMatrix::create(nullptr, pd->src_primitive_desc()); + cvtInVal_ = MKLDNNMatrix::createReorder(cpuInVal_, in); + CHECK(cvtInVal_) << "should not be emptry"; } else { - const MatrixPtr& cpuIn = getInputValue(0, CPU_DEVICE); memory::dims inDims = memory::dims{bs_, ic_, ih_, iw_}; - cpuInVal_ = MKLDNNMatrix::create(cpuIn, inDims, format::nchw, engine_); + cpuInVal_ = MKLDNNMatrix::create(inMat, inDims, format::nchw, engine_); if (cpuInVal_->getPrimitiveDesc() != in->getPrimitiveDesc()) { // create new mkldnn matrix in = MKLDNNMatrix::create(nullptr, pd->src_primitive_desc()); @@ -535,7 +544,7 @@ void MKLDNNConvLayer::resetWgtValBwdData( } else { wgtValBwdData_ = wgtVal_; } - VLOG(MKLDNN_FMTS) << "weight value format for backward data" + VLOG(MKLDNN_FMTS) << "weight value format for backward data: " << wgtValBwdData_->getFormat(); } diff --git a/paddle/gserver/layers/MKLDNNFcLayer.cpp b/paddle/gserver/layers/MKLDNNFcLayer.cpp index afd092666b..8cbfbd0d2b 100644 --- a/paddle/gserver/layers/MKLDNNFcLayer.cpp +++ b/paddle/gserver/layers/MKLDNNFcLayer.cpp @@ -49,7 +49,7 @@ bool MKLDNNFcLayer::init(const LayerMap& layerMap, // create biases if (biasParameter_.get() != NULL) { - biases_ = std::unique_ptr(new Weight(1, oc_, biasParameter_)); + biases_ = std::unique_ptr(new Weight(1, oc_, biasParameter_, 0)); } return true; } @@ -161,9 +161,16 @@ void MKLDNNFcLayer::resetInValue(MKLDNNMatrixPtr& in) { void MKLDNNFcLayer::resetWgtBiasValue(MKLDNNMatrixPtr& wgt, MKLDNNMatrixPtr& bias) { + format wgtFmt = format::oihw; + if (inVal_->getFormat() == format::nChw8c) { + wgtFmt = format::oIhw8i; + } else if (inVal_->getFormat() == format::nChw16c) { + wgtFmt = format::oIhw16i; + } wgt = MKLDNNMatrix::create( - weight_->getW(), {oc_, ic_, ih_, iw_}, format::oihw, engine_); + weight_->getW(), {oc_, ic_, ih_, iw_}, wgtFmt, engine_); wgt->downSpatial(); + VLOG(MKLDNN_FMTS) << "Weight value format: " << wgt->getFormat(); bias = (biases_ && biases_->getW()) ? MKLDNNMatrix::create(biases_->getW(), {oc_}, format::x, engine_) diff --git a/paddle/gserver/layers/MKLDNNLayer.h b/paddle/gserver/layers/MKLDNNLayer.h index d8555a8331..c09fd89462 100644 --- a/paddle/gserver/layers/MKLDNNLayer.h +++ b/paddle/gserver/layers/MKLDNNLayer.h @@ -115,6 +115,7 @@ public: copySeqInfoToOutputs(); size_t elemenCnt = inputLayers_[0]->getOutput().value->getElementCnt(); if (inputElemenCnt_ != elemenCnt) { + VLOG(MKLDNN_BASE) << getName() << " reset mkldnn forward"; // reset when input total sizes changed, not only the batchsize inputElemenCnt_ = elemenCnt; reshape(bs_, ic_, ih_, iw_, oc_, oh_, ow_); @@ -142,6 +143,7 @@ public: void backward(const UpdateCallback& callback) override { if (needResetBwd_) { + VLOG(MKLDNN_BASE) << getName() << " reset mkldnn backward"; resetBwd(pipelineBwd_, inGrad_, wgtGrad_, biasGrad_, outGrad_); needResetBwd_ = false; } diff --git a/paddle/trainer/tests/CMakeLists.txt b/paddle/trainer/tests/CMakeLists.txt index f01ad4142d..066837ca95 100644 --- a/paddle/trainer/tests/CMakeLists.txt +++ b/paddle/trainer/tests/CMakeLists.txt @@ -37,6 +37,19 @@ add_test(NAME test_CompareTwoNets --config_file_a=trainer/tests/sample_trainer_config_qb_rnn.conf --config_file_b=trainer/tests/sample_trainer_config_rnn.conf WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/) +################ test_CompareMKLDNNandCPU ###################### +if(WITH_MKLDNN) + add_unittest_without_exec(test_CompareMKLDNNandCPU + test_CompareTwoNets.cpp) + add_test(NAME test_CompareMKLDNNandCPU + COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/ + ${CMAKE_CURRENT_BINARY_DIR}/test_CompareMKLDNNandCPU + --config_file_a=trainer/tests/sample_trainer_config_simple_net.conf --use_mkldnn_a=True + --config_file_b=trainer/tests/sample_trainer_config_simple_net.conf --use_mkldnn_b=False + --use_gpu=False + WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/) +endif() + ############### test_CompareTwoOpts ################### add_unittest_without_exec(test_CompareTwoOpts test_CompareTwoOpts.cpp) diff --git a/paddle/trainer/tests/sample_trainer_config_simple_net.conf b/paddle/trainer/tests/sample_trainer_config_simple_net.conf new file mode 100644 index 0000000000..77f7816153 --- /dev/null +++ b/paddle/trainer/tests/sample_trainer_config_simple_net.conf @@ -0,0 +1,63 @@ +# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.trainer_config_helpers import * + +################################### Data Configuration ################################### +TrainData(ProtoData(files = "trainer/tests/mnist.list")) +################################### Algorithm Configuration ################################### +settings(batch_size = 1000, + learning_method = MomentumOptimizer(momentum=0.5, sparse=False)) +################################### Network Configuration ################################### +data = data_layer(name ="input", size=784) + +tmp = img_conv_layer(input=data, + num_channels=1, + filter_size=3, + num_filters=32, + padding=1, + shared_biases=True, + act=ReluActivation()) + +tmp = img_pool_layer(input=tmp, + pool_size=3, + stride=2, + padding=1, + pool_type=AvgPooling()) + +tmp = img_conv_layer(input=tmp, + filter_size=3, + num_filters=64, + padding=1, + shared_biases=True, + act=ReluActivation()) + +tmp = img_pool_layer(input=tmp, + pool_size=3, + stride=2, + padding=1, + pool_type=MaxPooling()) + +tmp = fc_layer(input=tmp, size=64, + bias_attr=True, + act=ReluActivation()) + +output = fc_layer(input=tmp, size=10, + bias_attr=True, + act=SoftmaxActivation()) + +lbl = data_layer(name ="label", size=10) + +cost = classification_cost(input=output, label=lbl) +outputs(cost) diff --git a/paddle/trainer/tests/test_CompareTwoNets.cpp b/paddle/trainer/tests/test_CompareTwoNets.cpp index 94f65e545d..307645d2c3 100644 --- a/paddle/trainer/tests/test_CompareTwoNets.cpp +++ b/paddle/trainer/tests/test_CompareTwoNets.cpp @@ -26,12 +26,15 @@ DECLARE_int32(gpu_id); DECLARE_bool(local); DECLARE_bool(use_gpu); +DECLARE_bool(use_mkldnn); DECLARE_string(config); DECLARE_string(nics); DEFINE_string(config_file_a, "", "config of one network to compare"); DEFINE_string(config_file_b, "", "config of another network to compare"); +DEFINE_bool(use_mkldnn_a, false, "whether to use mkldnn to run config_file_a"); +DEFINE_bool(use_mkldnn_b, false, "whether to use mkldnn to run config_file_b"); DEFINE_bool(need_high_accuracy, false, "whether need to run in double accuracy"); @@ -128,6 +131,12 @@ void compareGradient(ComData& comDataA, ComData& comDataB) { matA.getWidth()); } + if (FLAGS_use_mkldnn_a || FLAGS_use_mkldnn_b) { + // some format of mkldnn parameter is different with cpu + // test_MKLDNN will check the parameters + return; + } + vector& parametersA = comDataA.parameters; vector& parametersB = comDataB.parameters; @@ -167,10 +176,12 @@ void compareGradient(ComData& comDataA, ComData& comDataB) { TEST(Trainer, create) { ComData dataA; + FLAGS_use_mkldnn = FLAGS_use_mkldnn_a; calcGradient(dataA, FLAGS_config_file_a); LOG(INFO) << "\n\nforwardBackward of Network A is finished\n\n"; ComData dataB; + FLAGS_use_mkldnn = FLAGS_use_mkldnn_b; calcGradient(dataB, FLAGS_config_file_b); LOG(INFO) << "\n\nforwardBackward of the Network B is finished\n\n";