From 4fbba65626fec5eea2cf4eef8c7a81bd29690fe5 Mon Sep 17 00:00:00 2001
From: tensor-tang
Date: Wed, 15 Nov 2017 15:31:51 +0800
Subject: [PATCH 1/5] auto set cpu env when mkldnn or mklml enabled for V1 API

---
 paddle/scripts/submit_local.sh.in | 47 +++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)

diff --git a/paddle/scripts/submit_local.sh.in b/paddle/scripts/submit_local.sh.in
index 5c4b5a2495..4bf25c69e3 100755
--- a/paddle/scripts/submit_local.sh.in
+++ b/paddle/scripts/submit_local.sh.in
@@ -43,6 +43,51 @@ function ver2num() {
   set +e
 }
 
+function cpu_config() {
+  # auto set KMP_AFFINITY and OMP_DYNAMIC from Hyper Threading Status
+  # only when MKLDNN or MKLML enabled
+  if [ "@WITH_MKLDNN@" == "OFF" ] && [ "@WITH_MKLML@" == "OFF" ]; then
+    return 0
+  fi
+  ht=`lscpu |grep "per core"|awk -F':' '{print $2}'|xargs`
+  if [ $ht -eq 1 ]; then # HT is OFF
+    if [ -z "$KMP_AFFINITY" ]; then
+      export KMP_AFFINITY="granularity=fine,compact,0,0"
+    fi
+    if [ -z "$OMP_DYNAMIC" ]; then
+      export OMP_DYNAMIC="FALSE"
+    fi
+  else # HT is ON
+    if [ -z "$KMP_AFFINITY" ]; then
+      export KMP_AFFINITY="granularity=fine,compact,1,0"
+    fi
+    if [ -z "$OMP_DYNAMIC" ]; then
+      export OMP_DYNAMIC="True"
+    fi
+  fi
+}
+
+function threads_config() {
+  # auto set OMP_NUM_THREADS and MKL_NUM_THREADS
+  # according to trainer_count and total processors
+  # only when MKLDNN or MKLML enabled
+  if [ "@WITH_MKLDNN@" == "OFF" ] && [ "@WITH_MKLML@" == "OFF" ]; then
+    return 0
+  fi
+  processors=`grep "processor" /proc/cpuinfo|sort -u|wc -l`
+  trainers=`grep -Eo 'trainer_count.[0-9]+' <<< "$@" |grep -Eo '[0-9]+'|xargs`
+  if [ -z $trainers ]; then
+    trainers=1
+  fi
+  threads=$((processors / trainers))
+  if [ -z "$OMP_NUM_THREADS" ]; then
+    export OMP_NUM_THREADS=$threads
+  fi
+  if [ -z "$MKL_NUM_THREADS" ]; then
+    export MKL_NUM_THREADS=$threads
+  fi
+}
+
 PADDLE_CONF_HOME="$HOME/.config/paddle"
 mkdir -p ${PADDLE_CONF_HOME}
 
@@ -92,9 +137,11 @@
 else:
   sys.exit(0)
 EOF
 
+cpu_config
 case "$1" in
     "train")
+        threads_config $@
         ${DEBUGGER} $PADDLE_BIN_PATH/paddle_trainer ${@:2}
         ;;
     "merge_model")

From a6f5f6efb69a14c7c8c654f36a08c467ceb7b258 Mon Sep 17 00:00:00 2001
From: tensor-tang
Date: Wed, 15 Nov 2017 17:14:11 +0800
Subject: [PATCH 2/5] set the thread count to at least 1, in case the trainer
 count is larger than the number of processors

---
 paddle/scripts/submit_local.sh.in | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/paddle/scripts/submit_local.sh.in b/paddle/scripts/submit_local.sh.in
index 4bf25c69e3..1cc5078494 100755
--- a/paddle/scripts/submit_local.sh.in
+++ b/paddle/scripts/submit_local.sh.in
@@ -79,7 +79,10 @@ function threads_config() {
   if [ -z $trainers ]; then
     trainers=1
   fi
-  threads=$((processors / trainers))
+  threads=$((processors / trainers))
+  if [ $threads -eq 0 ]; then
+    threads=1
+  fi
   if [ -z "$OMP_NUM_THREADS" ]; then
     export OMP_NUM_THREADS=$threads
   fi

From d66d6c6ea355832243667ea5a01add40fb3e8f73 Mon Sep 17 00:00:00 2001
From: tensor-tang
Date: Wed, 15 Nov 2017 17:21:27 +0800
Subject: [PATCH 3/5] auto set cpu environment in V2 API

---
 python/paddle/v2/__init__.py | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/python/paddle/v2/__init__.py b/python/paddle/v2/__init__.py
index 3d70513843..a55b9d7a21 100644
--- a/python/paddle/v2/__init__.py
+++ b/python/paddle/v2/__init__.py
@@ -78,6 +78,31 @@ def init(**kwargs):
     for key in args_dict.keys():
         args.append('--%s=%s' % (key, str(args_dict[key])))
 
+    # auto set cpu environment
+    def set_env(key, value):
+        '''If the key has not been set in the environment, set it with value.'''
+        assert isinstance(key, str)
+        assert isinstance(value, str)
+        envset = os.environ.get(key)
+        if envset is None:
+            os.environ[key] = value
+
+    ht = os.popen("lscpu |grep \"per core\"|awk -F':' '{print $2}'|xargs")
+    ht = int(ht.read())
+    if ht == 1:  # ht is off
+        set_env("OMP_DYNAMIC", "false")
+        set_env("KMP_AFFINITY", "granularity=fine,compact,0,0")
+    else:
+        set_env("OMP_DYNAMIC", "true")
+        set_env("KMP_AFFINITY", "granularity=fine,compact,1,0")
+    processors = os.popen("grep \"processor\" /proc/cpuinfo|sort -u|wc -l")
+    processors = int(processors.read())
+    trainers = kwargs.get('trainer_count', 1)
+    threads = processors / trainers
+    threads = '1' if threads < 1 else str(threads)
+    set_env("OMP_NUM_THREADS", threads)
+    set_env("MKL_NUM_THREADS", threads)
+
     if 'use_gpu' in kwargs:
         cp.g_command_config_args['use_gpu'] = kwargs['use_gpu']
     if 'use_mkldnn' in kwargs:

From a3b2b7b1c754f944db0fae8a015d84a5b1238652 Mon Sep 17 00:00:00 2001
From: tensor-tang
Date: Wed, 15 Nov 2017 17:23:41 +0800
Subject: [PATCH 4/5] remove the hard-coded settings from the benchmark scripts

---
 benchmark/paddle/image/run_mkldnn.sh | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/benchmark/paddle/image/run_mkldnn.sh b/benchmark/paddle/image/run_mkldnn.sh
index a4527e0496..3cc779b48d 100755
--- a/benchmark/paddle/image/run_mkldnn.sh
+++ b/benchmark/paddle/image/run_mkldnn.sh
@@ -1,9 +1,7 @@
 set -e
 
 function train() {
-  unset OMP_NUM_THREADS MKL_NUM_THREADS
-  export OMP_DYNAMIC="FALSE"
-  export KMP_AFFINITY="granularity=fine,compact,0,0"
+  unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY
   topology=$1
   layer_num=$2
   bs=$3
@@ -14,8 +12,6 @@ function train() {
   elif [ $4 == "False" ]; then
     thread=`nproc`
     # each trainer_count use only 1 core to avoid conflict
-    export OMP_NUM_THREADS=1
-    export MKL_NUM_THREADS=1
     log="logs/${topology}-${layer_num}-${thread}mklml-${bs}.log"
   else
     echo "Wrong input $3, use True or False."

From 6337007ef5745977fdfdc9b6d051eefbd1e6260e Mon Sep 17 00:00:00 2001
From: tensor-tang
Date: Thu, 16 Nov 2017 11:35:36 +0800
Subject: [PATCH 5/5] add commented-out echoes of the resulting settings

---
 paddle/scripts/submit_local.sh.in | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/paddle/scripts/submit_local.sh.in b/paddle/scripts/submit_local.sh.in
index 1cc5078494..b9a49526a7 100755
--- a/paddle/scripts/submit_local.sh.in
+++ b/paddle/scripts/submit_local.sh.in
@@ -141,10 +141,12 @@ else:
   sys.exit(0)
 EOF
 
 cpu_config
+# echo $KMP_AFFINITY $OMP_DYNAMIC
 case "$1" in
     "train")
         threads_config $@
+        # echo $OMP_NUM_THREADS $MKL_NUM_THREADS
         ${DEBUGGER} $PADDLE_BIN_PATH/paddle_trainer ${@:2}
         ;;
     "merge_model")
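
Note (not part of the patches above): a minimal, standalone sketch of the thread-partitioning rule these commits implement, assuming a hypothetical helper name demo_thread_config and example values (8 logical processors, trainer_count=4). It only mirrors the logic of threads_config and the V2 init() change: divide the processor count by the trainer count, never go below one thread, and leave any value the user already exported untouched.

# demo_thread_config is a hypothetical name used only for illustration.
import os

def demo_thread_config(trainer_count, processors):
    # one slice of the OpenMP/MKL thread pool per trainer, but at least 1 thread
    threads = max(1, processors // trainer_count)
    for key in ("OMP_NUM_THREADS", "MKL_NUM_THREADS"):
        if os.environ.get(key) is None:  # respect a user-provided setting
            os.environ[key] = str(threads)

if __name__ == "__main__":
    demo_thread_config(trainer_count=4, processors=8)
    print(os.environ["OMP_NUM_THREADS"])  # prints "2" unless it was already set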