diff --git a/CMakeLists.txt b/CMakeLists.txt
index 39f876bc9e..d7e7e49e9a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -109,11 +109,9 @@ else()
     set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-g -O3 --use_fast_math")
 
     if(WITH_AVX)
-        if(AVX_FOUND)
-            set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler -mavx")
-        endif(AVX_FOUND)
+        set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${AVX_FLAG}")
     else(WITH_AVX)
-        set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler -msse3")
+        set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${SSE3_FLAG}")
     endif(WITH_AVX)
 
     if(WITH_DSO)
@@ -138,11 +136,11 @@ if(NOT WITH_TIMER)
 endif(NOT WITH_TIMER)
 
 if(WITH_AVX)
-    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${AVX_FLAGS}")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${AVX_FLAGS}")
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${AVX_FLAG}")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${AVX_FLAG}")
 else(WITH_AVX)
-    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse3")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse3")
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SSE3_FLAG}")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SSE3_FLAG}")
 endif(WITH_AVX)
 
 if(WITH_PYTHON)
diff --git a/cmake/FindAVX.cmake b/cmake/FindAVX.cmake
index f6103c6e66..d380c996df 100644
--- a/cmake/FindAVX.cmake
+++ b/cmake/FindAVX.cmake
@@ -3,36 +3,55 @@
 
 INCLUDE(CheckCXXSourceRuns)
 
-SET(FIND_AVX_10)
-SET(FIND_AVX_20)
-SET(AVX_FLAGS)
-SET(AVX_FOUND)
-
-# Check AVX 2
-SET(CMAKE_REQUIRED_FLAGS)
 IF(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
-    SET(CMAKE_REQUIRED_FLAGS "-mavx2")
-ELSEIF(MSVC AND NOT CMAKE_CL_64) # reserve for WINDOWS
-    SET(CMAKE_REQUIRED_FLAGS "/arch:AVX2")
+    set(MMX_FLAG "-mmmx")
+    set(SSE2_FLAG "-msse2")
+    set(SSE3_FLAG "-msse3")
+    SET(AVX_FLAG "-mavx")
+    SET(AVX2_FLAG "-mavx2")
+ELSEIF(MSVC)
+    set(MMX_FLAG "/arch:MMX")
+    set(SSE2_FLAG "/arch:SSE2")
+    set(SSE3_FLAG "/arch:SSE3")
+    SET(AVX_FLAG "/arch:AVX")
+    SET(AVX2_FLAG "/arch:AVX2")
 ENDIF()
 
+# Check MMX
+set(CMAKE_REQUIRED_FLAGS ${MMX_FLAG})
 CHECK_CXX_SOURCE_RUNS("
-#include <immintrin.h>
+#include <mmintrin.h>
 int main()
 {
-    __m256i a = _mm256_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4);
-    __m256i result = _mm256_abs_epi32 (a);
+    _mm_setzero_si64();
     return 0;
-}" FIND_AVX_20)
+}" MMX_FOUND)
 
-# Check AVX
-SET(CMAKE_REQUIRED_FLAGS)
-IF(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
-    SET(CMAKE_REQUIRED_FLAGS "-mavx")
-ELSEIF(MSVC AND NOT CMAKE_CL_64)
-    SET(CMAKE_REQUIRED_FLAGS "/arch:AVX")
-endif()
+# Check SSE2
+set(CMAKE_REQUIRED_FLAGS ${SSE2_FLAG})
+CHECK_CXX_SOURCE_RUNS("
+#include <emmintrin.h>
+int main()
+{
+    _mm_setzero_si128();
+    return 0;
+}" SSE2_FOUND)
 
+# Check SSE3
+set(CMAKE_REQUIRED_FLAGS ${SSE3_FLAG})
+CHECK_CXX_SOURCE_RUNS("
+#include <pmmintrin.h>
+int main()
+{
+    __m128d a = _mm_set1_pd(6.28);
+    __m128d b = _mm_set1_pd(3.14);
+    __m128d result = _mm_addsub_pd(a, b);
+    result = _mm_movedup_pd(result);
+    return 0;
+}" SSE3_FOUND)
+
+# Check AVX
+set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG})
 CHECK_CXX_SOURCE_RUNS("
 #include <immintrin.h>
 int main()
 {
@@ -41,25 +60,17 @@ int main()
     __m256 b = _mm256_set_ps (1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f);
     __m256 result = _mm256_add_ps (a, b);
     return 0;
-}" FIND_AVX_10)
-
-IF(${FIND_AVX_20})
-    IF(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
-        SET(AVX_FLAGS "${AVX_FLAGS} -mavx2")
-    ELSEIF(MSVC)
-        SET(AVX_FLAGS "${AVX_FLAGS} /arch:AVX2")
-    ENDIF()
-ENDIF()
+}" AVX_FOUND)
 
-IF(${FIND_AVX_10})
-    IF(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
-        SET(AVX_FLAGS "${AVX_FLAGS} -mavx")
-    ELSEIF(MSVC)
-        SET(AVX_FLAGS "${AVX_FLAGS} /arch:AVX")
-    ENDIF()
-ENDIF()
+# Check AVX 2
+set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG})
+CHECK_CXX_SOURCE_RUNS("
+#include <immintrin.h>
+int main()
+{
+    __m256i a = _mm256_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4);
+    __m256i result = _mm256_abs_epi32 (a);
+    return 0;
+}" AVX2_FOUND)
 
-IF(${FIND_AVX_10})
-    SET(AVX_FOUND TRUE)
-    MESSAGE(STATUS "Find CPU supports ${AVX_FLAGS}.")
-ENDIF()
+mark_as_advanced(MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND)
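The rewritten `FindAVX.cmake` above decides support for each instruction set by compiling and running a one-intrinsic program under the matching compiler flag, so a check passes only when both the compiler and the build host's CPU handle it. For intuition, here is a rough Python analogue of the CPU half of that question. This is illustration only, not part of the patch; it assumes a Linux host, where `/proc/cpuinfo` lists feature flags (SSE3 is reported there as `pni`):

```python
# Rough analogue of the CHECK_CXX_SOURCE_RUNS probes above: ask the build
# host which SIMD sets it can execute. Unlike the CMake checks, this does
# not verify that the compiler also accepts the corresponding flag.
def simd_support(cpuinfo_path='/proc/cpuinfo'):
    wanted = {'mmx': 'mmx', 'sse2': 'sse2', 'sse3': 'pni',
              'avx': 'avx', 'avx2': 'avx2'}
    with open(cpuinfo_path) as f:
        for line in f:
            if line.startswith('flags'):
                flags = set(line.split(':', 1)[1].split())
                return set(n for n, flag in wanted.items() if flag in flags)
    return set()

if __name__ == '__main__':
    print('CPU supports: ' + ' '.join(sorted(simd_support())))
```

Compiling and running a tiny program, as the CMake code does, answers both questions at once, which is why the patch keeps `CHECK_CXX_SOURCE_RUNS` rather than inspecting the CPU directly.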
diff --git a/demo/image_classification/.gitignore b/demo/image_classification/.gitignore
index 76961dd143..6a05b8f663 100644
--- a/demo/image_classification/.gitignore
+++ b/demo/image_classification/.gitignore
@@ -5,3 +5,5 @@ plot.png
 train.log
 image_provider_copy_1.py
 *pyc
+train.list
+test.list
diff --git a/demo/image_classification/data/download_cifar.sh b/demo/image_classification/data/download_cifar.sh
old mode 100644
new mode 100755
diff --git a/demo/image_classification/image_provider.py b/demo/image_classification/image_provider.py
index 9e2f8b8949..305efbcdc6 100644
--- a/demo/image_classification/image_provider.py
+++ b/demo/image_classification/image_provider.py
@@ -58,24 +58,29 @@ def hook(settings, img_size, mean_img_size, num_classes, color, meta, use_jpeg,
     settings.logger.info('DataProvider Initialization finished')
 
 
-@provider(init_hook=hook)
-def processData(settings, file_name):
+@provider(init_hook=hook, min_pool_size=0)
+def processData(settings, file_list):
     """
     The main function for loading data.
     Load the batch, iterate all the images and labels in this batch.
-    file_name: the batch file name.
+    file_list: the batch file list.
     """
-    data = cPickle.load(io.open(file_name, 'rb'))
-    indexes = list(range(len(data['images'])))
-    if settings.is_train:
-        random.shuffle(indexes)
-    for i in indexes:
-        if settings.use_jpeg == 1:
-            img = image_util.decode_jpeg(data['images'][i])
-        else:
-            img = data['images'][i]
-        img_feat = image_util.preprocess_img(img, settings.img_mean,
-                                             settings.img_size, settings.is_train,
-                                             settings.color)
-        label = data['labels'][i]
-        yield img_feat.tolist(), int(label)
+    with open(file_list, 'r') as fdata:
+        lines = [line.strip() for line in fdata]
+        random.shuffle(lines)
+        for file_name in lines:
+            with io.open(file_name.strip(), 'rb') as file:
+                data = cPickle.load(file)
+                indexes = list(range(len(data['images'])))
+                if settings.is_train:
+                    random.shuffle(indexes)
+                for i in indexes:
+                    if settings.use_jpeg == 1:
+                        img = image_util.decode_jpeg(data['images'][i])
+                    else:
+                        img = data['images'][i]
+                    img_feat = image_util.preprocess_img(img, settings.img_mean,
+                                                         settings.img_size,
+                                                         settings.is_train,
+                                                         settings.color)
+                    label = data['labels'][i]
+                    yield img_feat.astype('float32'), int(label)
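The reworked `processData` above now receives the list file itself (`train.list` or `test.list`) instead of a single batch file, loads each referenced batch lazily with `cPickle`, and yields features as `float32` arrays rather than Python lists. Each batch file is a pickled dict with parallel `'images'` and `'labels'` entries. A minimal sketch of that on-disk contract follows; it is illustration only, with made-up file names and random raw images (real batches are produced by `preprocess.py`):

```python
# Illustrative sketch, not part of this patch: write one batch file in the
# layout processData expects, plus a train.list pointing at it.
import cPickle

import numpy as np


def write_batch(path, images, labels):
    # processData indexes data['images'][i] and data['labels'][i] together,
    # so the two sequences must have equal length.
    with open(path, 'wb') as f:
        cPickle.dump({'images': images, 'labels': labels}, f)


# Ten fake image arrays (raw mode, i.e. what the provider reads when
# use_jpeg is 0) with made-up labels.
images = [np.random.randint(0, 255, (3, 32, 32)).astype('uint8')
          for _ in range(10)]
labels = [i % 10 for i in range(10)]
write_batch('batch_0', images, labels)

# The list file holds one batch-file path per line.
with open('train.list', 'w') as f:
    f.write('batch_0\n')
```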
diff --git a/demo/image_classification/preprocess.py b/demo/image_classification/preprocess.py
index 0286a5d7e9..fe7ea19bf0 100755
--- a/demo/image_classification/preprocess.py
+++ b/demo/image_classification/preprocess.py
@@ -35,6 +35,8 @@ if __name__ == '__main__':
     data_creator = ImageClassificationDatasetCreater(data_dir,
                                                      processed_image_size,
                                                      color)
+    data_creator.train_list_name = "train.txt"
+    data_creator.test_list_name = "test.txt"
     data_creator.num_per_batch = 1000
     data_creator.overwrite = True
     data_creator.create_batches()
diff --git a/demo/image_classification/preprocess.sh b/demo/image_classification/preprocess.sh
index dfe3eb95d1..e3e86ff106 100755
--- a/demo/image_classification/preprocess.sh
+++ b/demo/image_classification/preprocess.sh
@@ -17,3 +17,6 @@ set -e
 data_dir=./data/cifar-out
 
 python preprocess.py -i $data_dir -s 32 -c 1
+
+echo "data/cifar-out/batches/train.txt" > train.list
+echo "data/cifar-out/batches/test.txt" > test.list
diff --git a/demo/image_classification/vgg_16_cifar.py b/demo/image_classification/vgg_16_cifar.py
index e8b8af4bd3..edd6988c48 100755
--- a/demo/image_classification/vgg_16_cifar.py
+++ b/demo/image_classification/vgg_16_cifar.py
@@ -25,8 +25,8 @@ if not is_predict:
             'img_size': 32,'num_classes': 10,
             'use_jpeg': 1,'color': "color"}
 
-    define_py_data_sources2(train_list=data_dir+"train.list",
-                            test_list=data_dir+'test.list',
+    define_py_data_sources2(train_list="train.list",
+                            test_list="test.list",
                             module='image_provider',
                             obj='processData',
                             args=args)
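Because the config now passes bare `train.list` and `test.list` names, the trainer has to be launched from the demo directory where `preprocess.sh` writes those lists, and a stale or missing entry only surfaces once `processData` opens it mid-training. A small, hypothetical pre-flight check, not part of the patch, can catch that earlier:

```python
# Hypothetical sanity check: verify every path referenced by the list files
# exists before launching the trainer, since the provider opens them lazily.
import os

for list_file in ('train.list', 'test.list'):
    with open(list_file) as f:
        for line in f:
            path = line.strip()
            if path and not os.path.exists(path):
                raise IOError('%s references missing file: %s'
                              % (list_file, path))
```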
diff --git a/doc/cluster/opensource/cluster_train.md b/doc/cluster/opensource/cluster_train.md
index 4763ede39b..cb493a88f0 100644
--- a/doc/cluster/opensource/cluster_train.md
+++ b/doc/cluster/opensource/cluster_train.md
@@ -1,26 +1,24 @@
-# Cluster Training
+# Distributed Training
 
-We provide some simple scripts ```paddle/scripts/cluster_train``` to help you to launch cluster training Job to harness PaddlePaddle's distributed trainning. For MPI and other cluster scheduler refer this naive script to implement more robust cluster training platform by yourself.
+In this article, we explain how to run distributed Paddle training jobs on clusters. We will create the distributed version of the single-process training example, [recommendation](https://github.com/baidu/Paddle/tree/develop/demo/recommendation).
 
-The following cluster demo is based on RECOMMENDATION local training demo in PaddlePaddle ```demo/recommendation``` directory. Assuming you enter the ```paddle/scripts/cluster_train/``` directory.
+[Scripts](https://github.com/baidu/Paddle/tree/develop/paddle/scripts/cluster_train) used in this article launch distributed jobs via SSH. They also work as a reference for users running more sophisticated cluster management systems like MPI and Kubernetes.
 
-## Pre-requirements
+## Prerequisites
 
-Firstly,
+1. The aforementioned scripts use a Python library, [fabric](http://www.fabfile.org/), to run SSH commands. We can use `pip` to install fabric:
 
-```bash
+   ```bash
 pip install fabric
-```
-
-Secondly, go through installing scripts to install PaddlePaddle at all nodes to make sure demo can run as local mode. For CUDA enabled training, we assume that CUDA is installed in ```/usr/local/cuda```, otherwise missed cuda runtime libraries error could be reported at cluster runtime. In one word, the local training environment should be well prepared for the simple scripts.
+   ```
 
-Then you should prepare same ROOT_DIR directory in all nodes. ROOT_DIR is from in cluster_train/conf.py. Assuming that the ROOT_DIR = /home/paddle, you can create ```paddle``` user account as well, at last ```paddle.py``` can ssh connections to all nodes with ```paddle``` user automatically.
+1. We need to install PaddlePaddle on all nodes in the cluster. To enable GPUs, we need to install CUDA in `/usr/local/cuda`; otherwise Paddle would report errors at runtime.
 
-At last you can create ssh mutual trust relationship between all nodes for easy ssh login, otherwise ```password``` should be provided at runtime from ```paddle.py```.
+1. Set the `ROOT_DIR` variable in [`cluster_train/conf.py`] on all nodes. For convenience, we often create a Unix user `paddle` on all nodes and set `ROOT_DIR=/home/paddle`. In this way, we can write public SSH keys into `/home/paddle/.ssh/authorized_keys` so that user `paddle` can SSH to all nodes without a password.
 
 ## Prepare Job Workspace
 
-```Job workspace``` is defined as one package directory which contains dependency libraries, train data, test data, model config file and all other related file dependencies.
+We refer to the directory where we put dependent libraries, config files, etc., as the *workspace*.
 
-These ```train/test``` data should be prepared before launching cluster job. To satisfy the requirement that train/test data are placed in different directory from workspace, PADDLE refers train/test data according to index file named as ```train.list/test.list``` which are used in model config file. So the train/test data also contains train.list/test.list two list file. All local training demo already provides scripts to help you create these two files, and all nodes in cluster job will handle files with same logical code in normal condition.
+The train/test data should be prepared before launching the cluster job. Because these data may live in a directory other than the workspace, Paddle locates them through index files named ```train.list``` and ```test.list```, which the model config file refers to; the prepared data therefore also includes these two list files. Every local training demo already provides scripts to create them, and all nodes in a cluster job handle the files with the same logic.
diff --git a/paddle/cuda/src/hl_dso_loader.cc b/paddle/cuda/src/hl_dso_loader.cc
index c0b5d6e357..b564b96903 100644
--- a/paddle/cuda/src/hl_dso_loader.cc
+++ b/paddle/cuda/src/hl_dso_loader.cc
@@ -48,7 +48,7 @@ static inline std::string join(const std::string& part1, const std::string& part
 
 static inline void GetDsoHandleFromDefaultPath(
     std::string& dso_path, void** dso_handle, int dynload_flags) {
-    LOG(INFO) << "Try to find cuda library: " << dso_path
+    VLOG(3) << "Try to find cuda library: " << dso_path
               << " from default system path.";
     // default search from LD_LIBRARY_PATH/DYLD_LIBRARY_PATH
     *dso_handle = dlopen(dso_path.c_str(), dynload_flags);
diff --git a/python/paddle/trainer_config_helpers/tests/configs/.gitignore b/python/paddle/trainer_config_helpers/tests/configs/.gitignore
index 52378fe7a4..eb646b4a71 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/.gitignore
+++ b/python/paddle/trainer_config_helpers/tests/configs/.gitignore
@@ -1 +1 @@
-*protostr
+protostr/*.unitest
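The `cluster_train.md` changes above describe driving every node over SSH with fabric. For illustration only, here is a minimal fabric-1.x sketch of that pattern; the host names are made up, and we assume the `paddle` command is installed on every node as the prerequisites require:

```python
# Illustrative only, not from the repository: run one command on each
# cluster node over SSH, the way the cluster_train scripts use fabric.
from fabric.api import env, execute, run

env.user = 'paddle'                 # the Unix account suggested above
env.hosts = ['node001', 'node002']  # made-up host names


def check_paddle():
    # Executed once per host in env.hosts.
    run('paddle version')


if __name__ == '__main__':
    execute(check_paddle)
```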